In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, recall_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import RandomUnderSampler
import itertools
from torch.optim.lr_scheduler import ReduceLROnPlateau
In [2]:
# Project-local helper that reads, splits and balances the NetFlow datasets.
from dataloader_creator import CreatorDL
# seed=42 for reproducible splits/undersampling; bs presumably the DataLoader
# batch size (the debug output below shows torch.Size([2048, 32]) batches).
creator = CreatorDL(seed=42, bs=2048)
In [3]:
# UNSW-NB15: read the NetFlow CSVs, split per attack category into
# train/test/val (50/25/25 per the printed counts), then balance each split
# (9000 benign vs 1000 per attack class) and wrap them in DataLoaders.
# Per the printed stats, features are min-max scaled to [0, 1] on train.
df_UNSW = creator.reader("NF-UNSW-NB15-v3")
df_train_UNSW, df_test_UNSW, df_val_UNSW = creator.splitter(df_UNSW)
train_loader_UNSW, test_loader_UNSW, val_loader_UNSW = creator.balancer(df_train_UNSW, df_test_UNSW, df_val_UNSW)
Processando a categoria: 'Benign' -> Treino: 1118865 | Teste: 559433 | Validação: 559433 Processando a categoria: 'Fuzzers' -> Treino: 16908 | Teste: 8454 | Validação: 8454 Processando a categoria: 'Exploits' -> Treino: 21374 | Teste: 10687 | Validação: 10687 Processando a categoria: 'Backdoor' -> Treino: 2329 | Teste: 1165 | Validação: 1165 Processando a categoria: 'Reconnaissance' -> Treino: 8537 | Teste: 4268 | Validação: 4269 Processando a categoria: 'Generic' -> Treino: 9825 | Teste: 4913 | Validação: 4913 Processando a categoria: 'DoS' -> Treino: 2990 | Teste: 1495 | Validação: 1495 Processando a categoria: 'Shellcode' -> Treino: 1190 | Teste: 595 | Validação: 596 Processando a categoria: 'Analysis' -> Treino: 613 | Teste: 306 | Validação: 307 Processando a categoria: 'Worms' -> Treino: 79 | Teste: 39 | Validação: 40 --- Base de Treino --- Tamanho: 1182710 linhas Categorias presentes: ['Benign' 'Exploits' 'Reconnaissance' 'Fuzzers' 'DoS' 'Generic' 'Backdoor' 'Shellcode' 'Analysis' 'Worms'] Attack Benign 1118865 Exploits 21374 Fuzzers 16908 Generic 9825 Reconnaissance 8537 DoS 2990 Backdoor 2329 Shellcode 1190 Analysis 613 Worms 79 Name: count, dtype: int64 ------------------------- --- Base de Teste --- Tamanho: 591355 linhas Categorias presentes: ['Benign' 'Generic' 'DoS' 'Reconnaissance' 'Exploits' 'Fuzzers' 'Backdoor' 'Shellcode' 'Analysis' 'Worms'] Attack Benign 559433 Exploits 10687 Fuzzers 8454 Generic 4913 Reconnaissance 4268 DoS 1495 Backdoor 1165 Shellcode 595 Analysis 306 Worms 39 Name: count, dtype: int64 ------------------------- --- Base de Validação --- Tamanho: 591359 linhas Categorias presentes: ['Benign' 'Fuzzers' 'Reconnaissance' 'Exploits' 'Generic' 'Analysis' 'Shellcode' 'Backdoor' 'DoS' 'Worms'] Attack Benign 559433 Exploits 10687 Fuzzers 8454 Generic 4913 Reconnaissance 4269 DoS 1495 Backdoor 1165 Shellcode 596 Analysis 307 Worms 40 Name: count, dtype: int64 ------------------------- --- train --- Label 1 9000 0 9000 Name: count, dtype: 
int64 Attack Benign 9000 DoS 1000 Shellcode 1000 Generic 1000 Analysis 1000 Reconnaissance 1000 Fuzzers 1000 Worms 1000 Exploits 1000 Backdoor 1000 Name: count, dtype: int64 torch.Size([18000, 32]) (tensor([0, 1]), tensor([9000, 9000])) tensor(0.) tensor(1.) tensor(0.0516) ------------------------- --- test --- Label 1 9000 0 9000 Name: count, dtype: int64 Attack Benign 9000 DoS 1000 Shellcode 1000 Generic 1000 Analysis 1000 Reconnaissance 1000 Fuzzers 1000 Worms 1000 Exploits 1000 Backdoor 1000 Name: count, dtype: int64 torch.Size([18000, 32]) (tensor([0, 1]), tensor([9000, 9000])) tensor(-1.4981e-07) tensor(4.5768) tensor(0.0510) ------------------------- --- val --- Label 1 9000 0 9000 Name: count, dtype: int64 Attack Benign 9000 DoS 1000 Shellcode 1000 Generic 1000 Analysis 1000 Reconnaissance 1000 Fuzzers 1000 Worms 1000 Exploits 1000 Backdoor 1000 Name: count, dtype: int64 torch.Size([18000, 32]) (tensor([0, 1]), tensor([9000, 9000])) tensor(-2.9962e-07) tensor(3.5191) tensor(0.0518)
In [4]:
# BoT-IoT: same pipeline as UNSW above — read, per-category split,
# balance (4000 benign vs 1000 per attack class) and build DataLoaders.
df_BOT= creator.reader("NF-BoT-IoT-v3")
df_train_BOT, df_test_BOT, df_val_BOT = creator.splitter(df_BOT)
train_loader_BOT, test_loader_BOT, val_loader_BOT = creator.balancer(df_train_BOT, df_test_BOT, df_val_BOT)
Processando a categoria: 'Benign' -> Treino: 25994 | Teste: 12997 | Validação: 12998 Processando a categoria: 'DDoS' -> Treino: 3575441 | Teste: 1787720 | Validação: 1787721 Processando a categoria: 'DoS' -> Treino: 4017095 | Teste: 2008547 | Validação: 2008548 Processando a categoria: 'Reconnaissance' -> Treino: 847566 | Teste: 423783 | Validação: 423783 Processando a categoria: 'Theft' -> Treino: 807 | Teste: 404 | Validação: 404 --- Base de Treino --- Tamanho: 8466903 linhas Categorias presentes: ['DDoS' 'DoS' 'Reconnaissance' 'Benign' 'Theft'] Attack DoS 4017095 DDoS 3575441 Reconnaissance 847566 Benign 25994 Theft 807 Name: count, dtype: int64 ------------------------- --- Base de Teste --- Tamanho: 4233451 linhas Categorias presentes: ['DDoS' 'DoS' 'Reconnaissance' 'Benign' 'Theft'] Attack DoS 2008547 DDoS 1787720 Reconnaissance 423783 Benign 12997 Theft 404 Name: count, dtype: int64 ------------------------- --- Base de Validação --- Tamanho: 4233454 linhas Categorias presentes: ['DoS' 'DDoS' 'Reconnaissance' 'Benign' 'Theft'] Attack DoS 2008548 DDoS 1787721 Reconnaissance 423783 Benign 12998 Theft 404 Name: count, dtype: int64 ------------------------- --- train --- Label 1 4000 0 4000 Name: count, dtype: int64 Attack Benign 4000 Reconnaissance 1000 DoS 1000 Theft 1000 DDoS 1000 Name: count, dtype: int64 torch.Size([8000, 32]) (tensor([0, 1]), tensor([4000, 4000])) tensor(0.) tensor(1.) 
tensor(0.0232) ------------------------- --- test --- Label 1 4000 0 4000 Name: count, dtype: int64 Attack Benign 4000 Reconnaissance 1000 DoS 1000 Theft 1000 DDoS 1000 Name: count, dtype: int64 torch.Size([8000, 32]) (tensor([0, 1]), tensor([4000, 4000])) tensor(-1.1910e-07) tensor(1.4751) tensor(0.0235) ------------------------- --- val --- Label 1 4000 0 4000 Name: count, dtype: int64 Attack Benign 4000 Reconnaissance 1000 DoS 1000 Theft 1000 DDoS 1000 Name: count, dtype: int64 torch.Size([8000, 32]) (tensor([0, 1]), tensor([4000, 4000])) tensor(-1.7865e-07) tensor(5.3125) tensor(0.0232)
In [5]:
# CICIDS2018: same pipeline — read, per-category split, balance
# (14000 benign vs 1000 per attack class) and build DataLoaders.
df_CIC= creator.reader("NF-CICIDS2018-v3")
df_train_CIC, df_test_CIC, df_val_CIC = creator.splitter(df_CIC)
train_loader_CIC, test_loader_CIC, val_loader_CIC = creator.balancer(df_train_CIC, df_test_CIC, df_val_CIC)
Processando a categoria: 'Benign' -> Treino: 8757313 | Teste: 4378656 | Validação: 4378657 Processando a categoria: 'FTP-BruteForce' -> Treino: 193360 | Teste: 96680 | Validação: 96680 Processando a categoria: 'SSH-Bruteforce' -> Treino: 94237 | Teste: 47118 | Validação: 47119 Processando a categoria: 'DoS_attacks-GoldenEye' -> Treino: 30650 | Teste: 15325 | Validação: 15325 Processando a categoria: 'DoS_attacks-Slowloris' -> Treino: 18020 | Teste: 9010 | Validação: 9010 Processando a categoria: 'DoS_attacks-SlowHTTPTest' -> Treino: 52775 | Teste: 26387 | Validação: 26388 Processando a categoria: 'DoS_attacks-Hulk' -> Treino: 50038 | Teste: 25019 | Validação: 25019 Processando a categoria: 'DDoS_attacks-LOIC-HTTP' -> Treino: 144294 | Teste: 72147 | Validação: 72148 Processando a categoria: 'DDOS_attack-LOIC-UDP' -> Treino: 1725 | Teste: 862 | Validação: 863 Processando a categoria: 'DDOS_attack-HOIC' -> Treino: 516155 | Teste: 258078 | Validação: 258078 Processando a categoria: 'Brute_Force_-Web' -> Treino: 809 | Teste: 404 | Validação: 405 Processando a categoria: 'Brute_Force_-XSS' -> Treino: 240 | Teste: 120 | Validação: 120 Processando a categoria: 'SQL_Injection' -> Treino: 220 | Teste: 110 | Validação: 110 Processando a categoria: 'Infilteration' -> Treino: 94076 | Teste: 47038 | Validação: 47038 Processando a categoria: 'Bot' -> Treino: 103851 | Teste: 51926 | Validação: 51926 --- Base de Treino --- Tamanho: 10057763 linhas Categorias presentes: ['Benign' 'Infilteration' 'DDoS_attacks-LOIC-HTTP' 'DDOS_attack-HOIC' 'FTP-BruteForce' 'DoS_attacks-Hulk' 'Bot' 'DoS_attacks-GoldenEye' 'SSH-Bruteforce' 'DoS_attacks-SlowHTTPTest' 'DoS_attacks-Slowloris' 'Brute_Force_-Web' 'DDOS_attack-LOIC-UDP' 'Brute_Force_-XSS' 'SQL_Injection'] Attack Benign 8757313 DDOS_attack-HOIC 516155 FTP-BruteForce 193360 DDoS_attacks-LOIC-HTTP 144294 Bot 103851 SSH-Bruteforce 94237 Infilteration 94076 DoS_attacks-SlowHTTPTest 52775 DoS_attacks-Hulk 50038 DoS_attacks-GoldenEye 30650 
DoS_attacks-Slowloris 18020 DDOS_attack-LOIC-UDP 1725 Brute_Force_-Web 809 Brute_Force_-XSS 240 SQL_Injection 220 Name: count, dtype: int64 ------------------------- --- Base de Teste --- Tamanho: 5028880 linhas Categorias presentes: ['Benign' 'Infilteration' 'DDOS_attack-HOIC' 'FTP-BruteForce' 'SSH-Bruteforce' 'DDoS_attacks-LOIC-HTTP' 'DDOS_attack-LOIC-UDP' 'Bot' 'DoS_attacks-GoldenEye' 'DoS_attacks-SlowHTTPTest' 'DoS_attacks-Hulk' 'DoS_attacks-Slowloris' 'Brute_Force_-Web' 'Brute_Force_-XSS' 'SQL_Injection'] Attack Benign 4378656 DDOS_attack-HOIC 258078 FTP-BruteForce 96680 DDoS_attacks-LOIC-HTTP 72147 Bot 51926 SSH-Bruteforce 47118 Infilteration 47038 DoS_attacks-SlowHTTPTest 26387 DoS_attacks-Hulk 25019 DoS_attacks-GoldenEye 15325 DoS_attacks-Slowloris 9010 DDOS_attack-LOIC-UDP 862 Brute_Force_-Web 404 Brute_Force_-XSS 120 SQL_Injection 110 Name: count, dtype: int64 ------------------------- --- Base de Validação --- Tamanho: 5028886 linhas Categorias presentes: ['Benign' 'FTP-BruteForce' 'DDoS_attacks-LOIC-HTTP' 'DDOS_attack-HOIC' 'Bot' 'SSH-Bruteforce' 'DoS_attacks-SlowHTTPTest' 'DoS_attacks-Hulk' 'Infilteration' 'DoS_attacks-GoldenEye' 'DoS_attacks-Slowloris' 'DDOS_attack-LOIC-UDP' 'Brute_Force_-XSS' 'Brute_Force_-Web' 'SQL_Injection'] Attack Benign 4378657 DDOS_attack-HOIC 258078 FTP-BruteForce 96680 DDoS_attacks-LOIC-HTTP 72148 Bot 51926 SSH-Bruteforce 47119 Infilteration 47038 DoS_attacks-SlowHTTPTest 26388 DoS_attacks-Hulk 25019 DoS_attacks-GoldenEye 15325 DoS_attacks-Slowloris 9010 DDOS_attack-LOIC-UDP 863 Brute_Force_-Web 405 Brute_Force_-XSS 120 SQL_Injection 110 Name: count, dtype: int64 ------------------------- --- train --- Label 0 14000 1 14000 Name: count, dtype: int64 Attack Benign 14000 DDoS_attacks-LOIC-HTTP 1000 Brute_Force_-Web 1000 FTP-BruteForce 1000 Infilteration 1000 SSH-Bruteforce 1000 DoS_attacks-GoldenEye 1000 DoS_attacks-SlowHTTPTest 1000 DoS_attacks-Slowloris 1000 DDOS_attack-LOIC-UDP 1000 DoS_attacks-Hulk 1000 SQL_Injection 1000 
Bot 1000 DDOS_attack-HOIC 1000 Brute_Force_-XSS 1000 Name: count, dtype: int64 torch.Size([28000, 32]) (tensor([0, 1]), tensor([14000, 14000])) tensor(0.) tensor(1.) tensor(0.0473) ------------------------- --- test --- Label 0 14000 1 14000 Name: count, dtype: int64 Attack Benign 14000 DDoS_attacks-LOIC-HTTP 1000 Brute_Force_-Web 1000 FTP-BruteForce 1000 Infilteration 1000 SSH-Bruteforce 1000 DoS_attacks-GoldenEye 1000 DoS_attacks-SlowHTTPTest 1000 DoS_attacks-Slowloris 1000 DDOS_attack-LOIC-UDP 1000 DoS_attacks-Hulk 1000 SQL_Injection 1000 Bot 1000 DDOS_attack-HOIC 1000 Brute_Force_-XSS 1000 Name: count, dtype: int64 torch.Size([28000, 32]) (tensor([0, 1]), tensor([14000, 14000])) tensor(0.) tensor(1.4776) tensor(0.0477) ------------------------- --- val --- Label 0 14000 1 14000 Name: count, dtype: int64 Attack Benign 14000 DDoS_attacks-LOIC-HTTP 1000 Brute_Force_-Web 1000 FTP-BruteForce 1000 Infilteration 1000 SSH-Bruteforce 1000 DoS_attacks-GoldenEye 1000 DoS_attacks-SlowHTTPTest 1000 DoS_attacks-Slowloris 1000 DDOS_attack-LOIC-UDP 1000 DoS_attacks-Hulk 1000 SQL_Injection 1000 Bot 1000 DDOS_attack-HOIC 1000 Brute_Force_-XSS 1000 Name: count, dtype: int64 torch.Size([28000, 32]) (tensor([0, 1]), tensor([14000, 14000])) tensor(0.) tensor(2.7903) tensor(0.0478)
In [6]:
# Group the per-dataset loaders. The order (0=UNSW, 1=BoT-IoT, 2=CICIDS2018)
# must match the A/B/C batch unpacking inside train_model below.
train_loaders = [train_loader_UNSW, train_loader_BOT, train_loader_CIC]
test_loaders = [test_loader_UNSW, test_loader_BOT, test_loader_CIC]
val_loaders = [val_loader_UNSW, val_loader_BOT, val_loader_CIC]
In [40]:
INPUT_DIM = 32


class IDSBranchyNet(nn.Module):
    """Two-exit MLP for binary intrusion detection (BranchyNet-style).

    A small shared encoder feeds two classifier heads: a lightweight
    early exit (``forward_exit1``) and a deep MLP exit (``forward_exit2``).
    Both heads emit raw logits of shape (batch, num_classes).
    """

    def __init__(self, input_dim=INPUT_DIM, num_classes=2):
        super(IDSBranchyNet, self).__init__()
        hidden = input_dim * 2
        # Shared encoder: one hidden layer twice as wide as the input.
        self.shared_layers = nn.Sequential(
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
        )
        # Early exit: a bare linear classifier on the shared features.
        self.exit1_layers = nn.Sequential(nn.Linear(hidden, num_classes))
        # Deep exit: stacked Linear -> LeakyReLU -> Dropout(0.2) blocks,
        # built in order so parameter init and state_dict keys match a
        # hand-written Sequential with the same layer sequence.
        widths = [hidden, 1024, 2048, 2048, 1024]
        deep = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            deep.extend([nn.Linear(fan_in, fan_out), nn.LeakyReLU(), nn.Dropout(0.2)])
        deep.append(nn.Linear(widths[-1], num_classes))
        self.exit2_layers = nn.Sequential(*deep)

    def forward_exit1(self, x):
        """Return logits from the cheap early-exit head."""
        return self.exit1_layers(self.shared_layers(x))

    def forward_exit2(self, x):
        """Return logits from the deep main head."""
        return self.exit2_layers(self.shared_layers(x))


model = IDSBranchyNet()
In [41]:
# Prefer GPU when available; this device is passed to train/evaluate below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
Using device: cuda
In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
import itertools
import matplotlib.pyplot as plt
import numpy as np
def train_model(model, train_loaders, val_loaders, epochs, lr, device, current_threshold, patience=15):
    """Jointly train a two-exit model on three datasets simultaneously.

    Each step draws one batch per dataset (A=UNSW, B=BoT-IoT, C=CICIDS2018).
    Samples whose exit-1 softmax confidence exceeds ``current_threshold``
    contribute to the exit-1 loss; the remaining (uncertain) samples
    contribute to the exit-2 loss. The optimizer minimizes the sum of the two
    per-dataset-averaged losses. Early stopping and ReduceLROnPlateau are
    driven by the joint validation loss.

    Args:
        model: module exposing ``forward_exit1(x)`` and ``forward_exit2(x)``.
        train_loaders: list of three training DataLoaders.
        val_loaders: list of three validation DataLoaders.
        epochs: maximum number of epochs.
        lr: initial Adam learning rate.
        device: torch device to train on.
        current_threshold: confidence cut that routes samples between exits.
        patience: epochs without val-loss improvement before early stopping.

    Returns:
        ``current_threshold`` (unchanged), echoed back for the caller.
    """
    import copy  # stdlib; used to snapshot best weights without aliasing live tensors

    print(f"\n[INIT] --- MODO DEBUG EXTREMO ATIVADO (CORRIGIDO) ---")
    print(f"[INIT] Device: {device} | LR: {lr} | Threshold: {current_threshold}")
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    # NOTE(review): factor=0.001 shrinks the LR 1000x on each plateau — very
    # aggressive; confirm this is intentional (0.1 is the common choice).
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.001, patience=7)
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    def _masked_loss(logits, labels, mask):
        """Cross-entropy over the masked subset only.

        FIX: the original computed the loss over the *whole* batch whenever
        the mask was non-empty, so the confidence routing never affected
        training. Returns a graph-preserving zero when the mask is empty.
        """
        if mask.any():
            return criterion(logits[mask], labels[mask])
        return 0.0 * logits.sum()

    # 'total_loss' is kept for history-key compatibility but never accumulated.
    metrics = [
        'loss1_a', 'loss1_b', 'loss1_c', 'loss_ex1_avg',
        'loss2_a', 'loss2_b', 'loss2_c', 'loss_ex2_avg',
        'l_joint', 'total_loss'
    ]
    history = {
        'train': {k: [] for k in metrics},
        'val': {k: [] for k in metrics}
    }
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_state = None
    # Cycle the shorter loaders so every step consumes one batch per dataset.
    max_train_batches = max(len(l) for l in train_loaders)
    train_iter_loaders = [itertools.cycle(l) if len(l) < max_train_batches else l for l in train_loaders]
    max_val_batches = max(len(l) for l in val_loaders)
    val_iter_loaders = [itertools.cycle(l) if len(l) < max_val_batches else l for l in val_loaders]

    for epoch in range(epochs):
        print(f"\n{'#'*30} EPOCH {epoch+1}/{epochs} START {'#'*30}")
        model.train()
        running_metrics = {k: 0.0 for k in metrics}
        total_steps = 0
        loader_iterators = [iter(l) for l in train_iter_loaders]
        for batch_idx in range(max_train_batches):
            print(f"\n>>> [TRAIN] BATCH {batch_idx} START <<<")
            try:
                batches = [next(it) for it in loader_iterators]
            except StopIteration:
                print("[DEBUG] StopIteration atingido.")
                break
            optimizer.zero_grad()
            (inputs_a, labels_a) = batches[0]
            (inputs_b, labels_b) = batches[1]
            (inputs_c, labels_c) = batches[2]
            inputs_a, labels_a = inputs_a.to(device), labels_a.to(device)
            inputs_b, labels_b = inputs_b.to(device), labels_b.to(device)
            inputs_c, labels_c = inputs_c.to(device), labels_c.to(device)
            # --- input-data sanity check (dataset A only) ---
            print(f"[DATA A] Shape: {inputs_a.shape} | Mean: {inputs_a.mean():.3f} | Std: {inputs_a.std():.3f} | Min: {inputs_a.min():.3f} | Max: {inputs_a.max():.3f}")
            # --- forward through the early exit ---
            out1_a = model.forward_exit1(inputs_a)
            out1_b = model.forward_exit1(inputs_b)
            out1_c = model.forward_exit1(inputs_c)
            print(f"[LOGITS Ex1 A] Mean Abs: {out1_a.abs().mean():.3f} | Max: {out1_a.max():.3f}")
            # Exit-1 confidences = max softmax probability per sample.
            probs_a = F.softmax(out1_a, dim=1)
            conf_a, _ = torch.max(probs_a, dim=1)
            probs_b = F.softmax(out1_b, dim=1)
            conf_b, _ = torch.max(probs_b, dim=1)
            probs_c = F.softmax(out1_c, dim=1)
            conf_c, _ = torch.max(probs_c, dim=1)
            # Clamp top-k so the debug print also works when num_classes < 3.
            num_classes = probs_a.size(1)
            k_val = min(3, num_classes)
            top_k_prob, top_k_idx = torch.topk(probs_a[0], k_val)
            print(f"[SAMPLE 0 PREDICTION A] Top{k_val} Probs: {top_k_prob.detach().cpu().numpy()} | Indices: {top_k_idx.detach().cpu().numpy()} | Label Real: {labels_a[0].item()}")
            print(f"[CONFIDENCE A] Mean: {conf_a.mean().item():.3f} | Std: {conf_a.std().item():.3f}")
            # Route: confident samples train exit 1, the rest train exit 2.
            mask_a_ex1 = conf_a > current_threshold
            mask_b_ex1 = conf_b > current_threshold
            mask_c_ex1 = conf_c > current_threshold
            mask_a_ex2 = ~mask_a_ex1
            mask_b_ex2 = ~mask_b_ex1
            mask_c_ex2 = ~mask_c_ex1
            print(f"[MASKS] A(Pass/Fail): {mask_a_ex1.sum()}/{mask_a_ex2.sum()} | B: {mask_b_ex1.sum()}/{mask_b_ex2.sum()} | C: {mask_c_ex1.sum()}/{mask_c_ex2.sum()}")
            # --- exit-1 loss (confident samples only) ---
            loss1_a = _masked_loss(out1_a, labels_a, mask_a_ex1)
            loss1_b = _masked_loss(out1_b, labels_b, mask_b_ex1)
            loss1_c = _masked_loss(out1_c, labels_c, mask_c_ex1)
            loss_ex1_avg = (loss1_a + loss1_b + loss1_c) / 3
            print(f"[LOSS Ex1] A: {loss1_a.item():.5f} | B: {loss1_b.item():.5f} | C: {loss1_c.item():.5f}")
            # --- exit-2 forward + loss (uncertain samples only) ---
            out2_a = model.forward_exit2(inputs_a)
            out2_b = model.forward_exit2(inputs_b)
            out2_c = model.forward_exit2(inputs_c)
            print(f"[LOGITS Ex2 A] Mean Abs: {out2_a.abs().mean():.3f} | Max: {out2_a.max():.3f}")
            loss2_a = _masked_loss(out2_a, labels_a, mask_a_ex2)
            loss2_b = _masked_loss(out2_b, labels_b, mask_b_ex2)
            loss2_c = _masked_loss(out2_c, labels_c, mask_c_ex2)
            loss_ex2_avg = (loss2_a + loss2_b + loss2_c) / 3
            print(f"[LOSS Ex2] A: {loss2_a.item():.5f} | B: {loss2_b.item():.5f} | C: {loss2_c.item():.5f}")
            l_joint = loss_ex1_avg + loss_ex2_avg
            print(f"** [JOINT LOSS] ** : {l_joint.item():.6f}")
            if torch.isnan(l_joint):
                # Abort the whole run on divergence; caller still gets the threshold.
                print("!!!!!!!!!! LOSS IS NAN !!!!!!!!!!")
                return current_threshold
            l_joint.backward()
            # --- per-layer gradient diagnostics ---
            print(f"[GRADIENTS CHECK]")
            has_grads = False
            for name, param in model.named_parameters():
                if param.grad is not None:
                    grad_mean = param.grad.abs().mean().item()
                    grad_max = param.grad.abs().max().item()
                    print(f" -> Layer: {name} | Grad Mean: {grad_mean:.6f} | Grad Max: {grad_max:.6f}")
                    has_grads = True
            if not has_grads:
                print("!!! NENHUM GRADIENTE ENCONTRADO EM TODO O MODELO !!!")
            total_norm = 0
            for p in model.parameters():
                if p.grad is not None:
                    total_norm += p.grad.data.norm(2).item() ** 2
            total_norm = total_norm ** 0.5
            print(f"[GRADIENT NORM TOTAL] {total_norm:.4f}")
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            running_metrics['loss1_a'] += loss1_a.item()
            running_metrics['loss1_b'] += loss1_b.item()
            running_metrics['loss1_c'] += loss1_c.item()
            running_metrics['loss_ex1_avg'] += loss_ex1_avg.item()
            running_metrics['loss2_a'] += loss2_a.item()
            running_metrics['loss2_b'] += loss2_b.item()
            running_metrics['loss2_c'] += loss2_c.item()
            running_metrics['loss_ex2_avg'] += loss_ex2_avg.item()
            running_metrics['l_joint'] += l_joint.item()
            total_steps += 1
        for key in metrics:
            history['train'][key].append(running_metrics[key] / total_steps)
        epoch_train_loss = history['train']['l_joint'][-1]
        print(f"\n[EPOCH SUMMARY] Train Loss: {epoch_train_loss:.4f}")
        # --- validation: same routed losses, no gradients ---
        print(f"\n[VALIDATION] Starting...")
        model.eval()
        running_metrics_val = {k: 0.0 for k in metrics}
        total_steps_val = 0
        val_loader_iterators = [iter(l) for l in val_iter_loaders]
        with torch.no_grad():
            for batch_val_idx in range(max_val_batches):
                try:
                    batches = [next(it) for it in val_loader_iterators]
                except StopIteration:
                    break
                if batch_val_idx == 0:
                    print("[VAL] Processando primeiro batch de validação...")
                (inputs_a, labels_a) = batches[0]
                (inputs_b, labels_b) = batches[1]
                (inputs_c, labels_c) = batches[2]
                inputs_a, labels_a = inputs_a.to(device), labels_a.to(device)
                inputs_b, labels_b = inputs_b.to(device), labels_b.to(device)
                inputs_c, labels_c = inputs_c.to(device), labels_c.to(device)
                out1_a = model.forward_exit1(inputs_a)
                out1_b = model.forward_exit1(inputs_b)
                out1_c = model.forward_exit1(inputs_c)
                probs_a = F.softmax(out1_a, dim=1)
                conf_a, _ = torch.max(probs_a, dim=1)
                probs_b = F.softmax(out1_b, dim=1)
                conf_b, _ = torch.max(probs_b, dim=1)
                probs_c = F.softmax(out1_c, dim=1)
                conf_c, _ = torch.max(probs_c, dim=1)
                mask_a_ex1 = conf_a > current_threshold
                mask_b_ex1 = conf_b > current_threshold
                mask_c_ex1 = conf_c > current_threshold
                mask_a_ex2 = ~mask_a_ex1
                mask_b_ex2 = ~mask_b_ex1
                mask_c_ex2 = ~mask_c_ex1
                loss1_a = _masked_loss(out1_a, labels_a, mask_a_ex1)
                loss1_b = _masked_loss(out1_b, labels_b, mask_b_ex1)
                loss1_c = _masked_loss(out1_c, labels_c, mask_c_ex1)
                loss_ex1_avg = (loss1_a + loss1_b + loss1_c) / 3
                out2_a = model.forward_exit2(inputs_a)
                out2_b = model.forward_exit2(inputs_b)
                out2_c = model.forward_exit2(inputs_c)
                loss2_a = _masked_loss(out2_a, labels_a, mask_a_ex2)
                loss2_b = _masked_loss(out2_b, labels_b, mask_b_ex2)
                loss2_c = _masked_loss(out2_c, labels_c, mask_c_ex2)
                loss_ex2_avg = (loss2_a + loss2_b + loss2_c) / 3
                l_joint = loss_ex1_avg + loss_ex2_avg
                running_metrics_val['loss1_a'] += loss1_a.item()
                running_metrics_val['loss1_b'] += loss1_b.item()
                running_metrics_val['loss1_c'] += loss1_c.item()
                running_metrics_val['loss_ex1_avg'] += loss_ex1_avg.item()
                running_metrics_val['loss2_a'] += loss2_a.item()
                running_metrics_val['loss2_b'] += loss2_b.item()
                running_metrics_val['loss2_c'] += loss2_c.item()
                running_metrics_val['loss_ex2_avg'] += loss_ex2_avg.item()
                running_metrics_val['l_joint'] += l_joint.item()
                total_steps_val += 1
        for key in metrics:
            history['val'][key].append(running_metrics_val[key] / total_steps_val)
        epoch_val_loss = history['val']['l_joint'][-1]
        thresh_print = current_threshold.item() if isinstance(current_threshold, torch.Tensor) else current_threshold
        print(f'[EPOCH END] Val Loss: {epoch_val_loss:.4f} | Alpha: {thresh_print:.4f}')
        if epoch_val_loss < best_val_loss:
            print(f"!!! BEST MODEL SAVED !!! (Old: {best_val_loss:.4f} -> New: {epoch_val_loss:.4f})")
            best_val_loss = epoch_val_loss
            epochs_no_improve = 0
            # FIX: deep-copy. state_dict() returns references to the live
            # parameter tensors, so the "best" snapshot would otherwise be
            # mutated by every subsequent optimizer step.
            best_model_state = copy.deepcopy(model.state_dict())
        else:
            epochs_no_improve += 1
            print(f"No improve count: {epochs_no_improve}/{patience}")
            if epochs_no_improve >= patience:
                print("EARLY STOPPING TRIGGERED")
                if best_model_state: model.load_state_dict(best_model_state)
                break
        scheduler.step(epoch_val_loss)

    # --- loss-curve plots (layout unchanged) ---
    epochs_range = range(1, len(history['train']['l_joint']) + 1)
    fig, axs = plt.subplots(1, 3, figsize=(20, 6))
    ax = axs[0]
    ax.set_title("Exit 1")
    ax.plot(epochs_range, history['train']['loss1_a'], label='Tr A', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss1_b'], label='Tr B', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss1_c'], label='Tr C', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss_ex1_avg'], label='Tr Avg', linewidth=2)
    ax.plot(epochs_range, history['val']['loss1_a'], label='Val A', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss1_b'], label='Val B', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss1_c'], label='Val C', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss_ex1_avg'], label='Val Avg', color='black', linestyle='--', linewidth=2)
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True)
    ax = axs[1]
    ax.set_title("Exit 2")
    ax.plot(epochs_range, history['train']['loss2_a'], label='Tr A', color='blue', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss2_b'], label='Tr B', color='green', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss2_c'], label='Tr C', color='red', alpha=0.6)
    ax.plot(epochs_range, history['train']['loss_ex2_avg'], label='Tr Avg', color='black', linewidth=2)
    ax.plot(epochs_range, history['val']['loss2_a'], label='Val A', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss2_b'], label='Val B', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss2_c'], label='Val C', color='black', linestyle='--')
    ax.plot(epochs_range, history['val']['loss_ex2_avg'], label='Val Avg', color='black', linestyle='--', linewidth=2)
    ax.set_xlabel('Epochs')
    ax.legend()
    ax.grid(True)
    ax = axs[2]
    ax.set_title("Global Optimization")
    ax.plot(epochs_range, history['train']['l_joint'], label='Tr Joint (Ex1 + Ex2)', color='purple')
    ax.plot(epochs_range, history['val']['l_joint'], label='Val Joint', color='purple', linestyle='--')
    ax.set_xlabel('Epochs')
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.show()
    return current_threshold
In [43]:
def evaluate_model(model, loader, confidence_threshold, device):
    """Evaluate the two-exit model with early-exit routing and report metrics.

    Samples whose exit-1 softmax confidence exceeds ``confidence_threshold``
    keep the early-exit prediction; the rest are re-classified by the deep
    exit. Plots a confusion matrix and prints binary-classification metrics.

    Args:
        model: module exposing ``forward_exit1`` / ``forward_exit2``.
        loader: DataLoader yielding (samples, labels); labels are expected to
            be binary (the confusion-matrix ravel below assumes 2x2).
        confidence_threshold: softmax-confidence cut for the early exit.
        device: torch device used for inference.

    Returns:
        dict with accuracy, early-exit rate, mean per-sample latency in ms,
        early-exit count, total sample count and F1 score.
    """
    model.to(device)
    model.eval()

    def _sync():
        # CUDA kernels launch asynchronously; synchronize before reading the
        # clock so timings are honest. FIX: guarded by device type — the
        # original called torch.cuda.synchronize() unconditionally, which
        # raises on CPU-only machines.
        if str(device).startswith('cuda'):
            torch.cuda.synchronize()

    total_samples = len(loader.dataset)
    all_predictions = []
    all_labels = []
    exited_early_count = 0
    total_inference_time = 0
    with torch.no_grad():
        for samples, labels in loader:
            samples, labels = samples.to(device), labels.to(device)
            _sync()
            start_time = time.perf_counter()
            # Exit 1: cheap head on every sample.
            branch_output = model.forward_exit1(samples)
            branch_prob = F.softmax(branch_output, dim=1)
            trusts, branch_preds = torch.max(branch_prob, 1)
            batch_predictions = torch.zeros_like(labels)
            early_exit_mask = trusts > confidence_threshold
            if early_exit_mask.any():
                batch_predictions[early_exit_mask] = branch_preds[early_exit_mask]
                exited_early_count += early_exit_mask.sum().item()
            # Exit 2: deep head only for low-confidence samples.
            main_branch_mask = ~early_exit_mask
            if main_branch_mask.any():
                samples_to_main = samples[main_branch_mask]
                main_output = model.forward_exit2(samples_to_main)
                main_prob = F.softmax(main_output, dim=1)
                _, main_preds = torch.max(main_prob, 1)
                batch_predictions[main_branch_mask] = main_preds
            _sync()
            end_time = time.perf_counter()
            total_inference_time += (end_time - start_time)
            all_predictions.append(batch_predictions.cpu())
            all_labels.append(labels.cpu())
    final_predictions = torch.cat(all_predictions)
    y_data = torch.cat(all_labels)
    correct = (final_predictions == y_data).sum().item()
    accuracy = 100 * correct / total_samples
    exit_rate = 100 * exited_early_count / total_samples
    avg_time_ms = (total_inference_time / total_samples) * 1000
    cm = confusion_matrix(y_data.numpy(), final_predictions.numpy())
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Ataque'],
                yticklabels=['Normal', 'Ataque'])
    plt.xlabel('Rótulo Previsto')
    plt.ylabel('Rótulo Verdadeiro')
    plt.title(f'Matriz de Confusão (Limiar de Confiança = {confidence_threshold})')
    plt.show()
    # ravel() of a 2x2 matrix -> tn, fp, fn, tp (requires both classes present).
    tn, fp, fn, tp = cm.ravel()
    f1 = f1_score(y_data.numpy(), final_predictions.numpy())
    tpr = recall_score(y_data.numpy(), final_predictions.numpy())
    tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
    print(f"True Positives (TP): {tp}")
    print(f"True Negatives (TN): {tn}")
    print(f"False Positives (FP): {fp}")
    print(f"False Negatives (FN): {fn}\n")
    print(f"F1 Score: {f1:.4f}")
    print(f"True Positive Rate (TPR) / Recall: {tpr:.4f}")
    print(f"True Negative Rate (TNR) / Specificity: {tnr:.4f}")
    return {
        'accuracy': accuracy,
        'exit_rate': exit_rate,
        'avg_inference_time_ms': avg_time_ms,
        'exited_early_count': exited_early_count,
        'total_samples': total_samples,
        'f1': f1
    }
In [44]:
# Checkpoint identifier — the trained weights are saved to models/<modelname>.pth.
modelname = 'teste_ljoint9'
modelname  # bare expression: display the name as cell output
Out[44]:
'teste_ljoint9'
In [45]:
import os

epochs = 500  # upper bound; early stopping (patience=15) usually ends sooner

# Train with a fixed exit-1 confidence threshold of 0.55; train_model echoes
# the threshold back unchanged.
limiar = train_model(
    model,
    train_loaders,
    val_loaders,
    epochs,
    current_threshold=0.55,
    lr=0.0001,
    device=device
)

# FIX: torch.save does not create directories — ensure 'models/' exists so a
# multi-hour training run is not lost to a FileNotFoundError at save time.
os.makedirs('models', exist_ok=True)
torch.save(model.state_dict(), f'models/{modelname}.pth')
print(f"\nModelo treinado e salvo em 'models/{modelname}.pth'")
[INIT] --- MODO DEBUG EXTREMO ATIVADO (CORRIGIDO) --- [INIT] Device: cuda | LR: 0.0001 | Threshold: 0.55 ############################## EPOCH 1/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.146 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5263452 0.47365478] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.004 [MASKS] A(Pass/Fail): 4/2044 | B: 107/1941 | C: 260/1788 [LOSS Ex1] A: 0.69271 | B: 0.69443 | C: 0.68963 [LOGITS Ex2 A] Mean Abs: 0.037 | Max: 0.047 [LOSS Ex2] A: 0.69331 | B: 0.69367 | C: 0.69326 ** [JOINT LOSS] ** : 1.385670 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000274 | Grad Max: 0.003790 -> Layer: shared_layers.0.bias | Grad Mean: 0.001401 | Grad Max: 0.006197 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.011667 -> Layer: exit1_layers.0.bias | Grad Mean: 0.031979 | Grad Max: 0.031979 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000001 | Grad Max: 0.000033 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000015 | Grad Max: 0.000141 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000054 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000022 | Grad Max: 0.000202 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000064 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000047 | Grad Max: 0.000366 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000080 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000151 | Grad Max: 0.001195 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000096 | Grad Max: 0.000669 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013826 | Grad Max: 0.013826 [GRADIENT NORM TOTAL] 0.0712 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.148 [SAMPLE 0 PREDICTION A] Top2 Probs: 
[0.51929814 0.48070186] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.004 [MASKS] A(Pass/Fail): 0/2048 | B: 110/1938 | C: 242/1806 [LOSS Ex1] A: 0.00000 | B: 0.69431 | C: 0.68802 [LOGITS Ex2 A] Mean Abs: 0.026 | Max: 0.036 [LOSS Ex2] A: 0.69189 | B: 0.69314 | C: 0.69393 ** [JOINT LOSS] ** : 1.153760 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000168 | Grad Max: 0.002961 -> Layer: shared_layers.0.bias | Grad Mean: 0.000797 | Grad Max: 0.003885 -> Layer: exit1_layers.0.weight | Grad Mean: 0.000948 | Grad Max: 0.004361 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013003 | Grad Max: 0.013003 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000002 | Grad Max: 0.000042 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000017 | Grad Max: 0.000164 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000061 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000023 | Grad Max: 0.000220 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000061 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000048 | Grad Max: 0.000389 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000093 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000151 | Grad Max: 0.001173 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000101 | Grad Max: 0.000873 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014207 | Grad Max: 0.014207 [GRADIENT NORM TOTAL] 0.0406 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.175 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52613604 0.473864 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 88/1960 | C: 274/1774 [LOSS Ex1] A: 0.69191 | B: 0.69315 | C: 0.68704 [LOGITS Ex2 A] Mean Abs: 0.015 | Max: 0.028 [LOSS Ex2] A: 0.69154 | B: 0.69260 | C: 0.69273 ** [JOINT LOSS] ** : 1.382988 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 
0.000293 | Grad Max: 0.003905 -> Layer: shared_layers.0.bias | Grad Mean: 0.001196 | Grad Max: 0.004533 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.011349 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021428 | Grad Max: 0.021428 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000002 | Grad Max: 0.000051 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000019 | Grad Max: 0.000136 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000100 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000024 | Grad Max: 0.000342 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000077 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000045 | Grad Max: 0.000403 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000127 | Grad Max: 0.001046 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000100 | Grad Max: 0.000727 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011559 | Grad Max: 0.011559 [GRADIENT NORM TOTAL] 0.0605 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5268466 0.47315347] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 99/1757 | C: 238/1810 [LOSS Ex1] A: 0.69059 | B: 0.69446 | C: 0.69029 [LOGITS Ex2 A] Mean Abs: 0.010 | Max: 0.021 [LOSS Ex2] A: 0.69096 | B: 0.69233 | C: 0.69240 ** [JOINT LOSS] ** : 1.383674 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000250 | Grad Max: 0.002665 -> Layer: shared_layers.0.bias | Grad Mean: 0.001150 | Grad Max: 0.005426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001719 | Grad Max: 0.009803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024030 | Grad Max: 0.024030 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000002 | Grad Max: 0.000066 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000018 | Grad 
Max: 0.000184 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000061 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000021 | Grad Max: 0.000189 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000050 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000035 | Grad Max: 0.000290 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000001 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000067 | Grad Max: 0.000948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000067 | Grad Max: 0.000486 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002856 | Grad Max: 0.002856 [GRADIENT NORM TOTAL] 0.0547 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.141 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5166012 0.48339874] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 107/1941 | C: 236/1812 [LOSS Ex1] A: 0.69022 | B: 0.69415 | C: 0.69074 [LOGITS Ex2 A] Mean Abs: 0.012 | Max: 0.024 [LOSS Ex2] A: 0.68936 | B: 0.69219 | C: 0.69213 ** [JOINT LOSS] ** : 1.382931 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000262 | Grad Max: 0.003296 -> Layer: shared_layers.0.bias | Grad Mean: 0.001157 | Grad Max: 0.005331 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.010335 -> Layer: exit1_layers.0.bias | Grad Mean: 0.023048 | Grad Max: 0.023048 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000025 | Grad Max: 0.000268 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000001 | Grad Max: 0.000066 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000027 | Grad Max: 0.000210 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000001 | Grad Max: 0.000062 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000043 | Grad Max: 0.000396 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000101 -> 
Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.001290 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000075 | Grad Max: 0.000533 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001089 | Grad Max: 0.001089 [GRADIENT NORM TOTAL] 0.0555 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.140 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.515395 0.48460504] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 110/1938 | C: 241/1807 [LOSS Ex1] A: 0.69316 | B: 0.69404 | C: 0.69074 [LOGITS Ex2 A] Mean Abs: 0.016 | Max: 0.031 [LOSS Ex2] A: 0.68849 | B: 0.69175 | C: 0.69200 ** [JOINT LOSS] ** : 1.383394 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000283 | Grad Max: 0.003826 -> Layer: shared_layers.0.bias | Grad Mean: 0.001676 | Grad Max: 0.007289 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.011786 -> Layer: exit1_layers.0.bias | Grad Mean: 0.036556 | Grad Max: 0.036556 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000004 | Grad Max: 0.000133 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000049 | Grad Max: 0.000529 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000002 | Grad Max: 0.000105 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000054 | Grad Max: 0.000388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000120 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000092 | Grad Max: 0.000808 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000227 | Grad Max: 0.001978 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000132 | Grad Max: 0.001007 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017831 | Grad Max: 0.017831 [GRADIENT NORM TOTAL] 0.0817 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.041 | Max: 0.171 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5218072 0.47819284] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 88/1960 | C: 240/1808 [LOSS Ex1] A: 0.69208 | B: 0.69287 | C: 0.69087 [LOGITS Ex2 A] Mean Abs: 0.018 | Max: 0.045 [LOSS Ex2] A: 0.68734 | B: 0.69096 | C: 0.69108 ** [JOINT LOSS] ** : 1.381736 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000286 | Grad Max: 0.003647 -> Layer: shared_layers.0.bias | Grad Mean: 0.001519 | Grad Max: 0.006798 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.011025 -> Layer: exit1_layers.0.bias | Grad Mean: 0.031033 | Grad Max: 0.031033 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000004 | Grad Max: 0.000154 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000051 | Grad Max: 0.000728 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000002 | Grad Max: 0.000102 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000049 | Grad Max: 0.000371 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000115 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000077 | Grad Max: 0.000714 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000135 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000174 | Grad Max: 0.001524 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000119 | Grad Max: 0.000896 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011617 | Grad Max: 0.011617 [GRADIENT NORM TOTAL] 0.0710 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.097 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5116345 0.48836553] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 98/1758 | C: 264/1784 [LOSS Ex1] A: 0.00000 | B: 0.69418 | C: 0.69090 [LOGITS Ex2 A] Mean Abs: 0.022 | Max: 0.065 [LOSS Ex2] A: 0.68536 | B: 0.69033 | C: 0.69078 ** [JOINT LOSS] ** : 1.150513 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000192 | Grad Max: 0.002905 -> Layer: shared_layers.0.bias | Grad Mean: 0.001436 | Grad Max: 0.006228 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001276 | Grad Max: 0.004786 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024602 | Grad Max: 0.024602 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000005 | Grad Max: 0.000162 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000049 | Grad Max: 0.000580 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000002 | Grad Max: 0.000108 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000043 | Grad Max: 0.000413 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000086 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000063 | Grad Max: 0.000583 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000120 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.001281 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000142 | Grad Max: 0.000974 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007047 | Grad Max: 0.007047 [GRADIENT NORM TOTAL] 0.0541 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.107 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51330423 0.4866957 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 107/1941 | C: 253/1795 [LOSS Ex1] A: 0.00000 | B: 0.69387 | C: 0.68930 [LOGITS Ex2 A] Mean Abs: 0.030 | Max: 0.089 [LOSS Ex2] A: 0.68325 | B: 0.68952 | C: 0.68942 ** [JOINT LOSS] ** : 1.148453 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000196 | Grad Max: 0.003207 -> Layer: shared_layers.0.bias | Grad Mean: 0.001058 | Grad Max: 0.004890 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001090 | Grad Max: 0.004547 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018956 | Grad Max: 0.018956 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000007 | Grad Max: 
0.000166 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000060 | Grad Max: 0.000562 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000003 | Grad Max: 0.000149 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000048 | Grad Max: 0.000532 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000115 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000065 | Grad Max: 0.000572 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.001075 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000219 | Grad Max: 0.001389 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006146 | Grad Max: 0.006146 [GRADIENT NORM TOTAL] 0.0487 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.150 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52465856 0.47534147] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.005 [MASKS] A(Pass/Fail): 4/2044 | B: 108/1940 | C: 241/1807 [LOSS Ex1] A: 0.69178 | B: 0.69377 | C: 0.68912 [LOGITS Ex2 A] Mean Abs: 0.039 | Max: 0.120 [LOSS Ex2] A: 0.68193 | B: 0.68846 | C: 0.68912 ** [JOINT LOSS] ** : 1.378061 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000291 | Grad Max: 0.004099 -> Layer: shared_layers.0.bias | Grad Mean: 0.001262 | Grad Max: 0.005157 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001883 | Grad Max: 0.010712 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024290 | Grad Max: 0.024290 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000007 | Grad Max: 0.000185 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000074 | Grad Max: 0.000632 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000003 | Grad Max: 0.000184 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000059 | Grad Max: 0.000491 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000102 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000075 | Grad Max: 0.000725 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000127 | Grad Max: 0.001376 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000244 | Grad Max: 0.001452 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006685 | Grad Max: 0.006685 [GRADIENT NORM TOTAL] 0.0648 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.152 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5175542 0.48244575] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 88/1960 | C: 238/1810 [LOSS Ex1] A: 0.00000 | B: 0.69258 | C: 0.68859 [LOGITS Ex2 A] Mean Abs: 0.046 | Max: 0.141 [LOSS Ex2] A: 0.67841 | B: 0.68664 | C: 0.68757 ** [JOINT LOSS] ** : 1.144593 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000217 | Grad Max: 0.003782 -> Layer: shared_layers.0.bias | Grad Mean: 0.001008 | Grad Max: 0.004703 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001011 | Grad Max: 0.004472 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015994 | Grad Max: 0.015994 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000009 | Grad Max: 0.000239 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000079 | Grad Max: 0.000897 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000003 | Grad Max: 0.000161 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000060 | Grad Max: 0.000453 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000002 | Grad Max: 0.000112 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000076 | Grad Max: 0.000701 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000129 | Grad Max: 0.001627 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001743 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004185 | Grad Max: 0.004185 [GRADIENT NORM TOTAL] 0.0500 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.176 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5242592 0.47574073] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 97/1759 | C: 243/1805 [LOSS Ex1] A: 0.69096 | B: 0.69390 | C: 0.69062 [LOGITS Ex2 A] Mean Abs: 0.053 | Max: 0.173 [LOSS Ex2] A: 0.67476 | B: 0.68592 | C: 0.68723 ** [JOINT LOSS] ** : 1.374464 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000313 | Grad Max: 0.004346 -> Layer: shared_layers.0.bias | Grad Mean: 0.001491 | Grad Max: 0.006072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.011524 -> Layer: exit1_layers.0.bias | Grad Mean: 0.029302 | Grad Max: 0.029302 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000010 | Grad Max: 0.000249 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000090 | Grad Max: 0.000977 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000004 | Grad Max: 0.000200 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000063 | Grad Max: 0.000531 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000121 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000077 | Grad Max: 0.000625 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000139 | Grad Max: 0.001316 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001697 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001239 | Grad Max: 0.001239 [GRADIENT NORM TOTAL] 0.0743 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52507335 0.47492662] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 106/1942 | C: 245/1803 [LOSS Ex1] A: 0.68993 | B: 0.69359 | C: 0.68994 [LOGITS Ex2 A] Mean Abs: 
0.059 | Max: 0.204 [LOSS Ex2] A: 0.67715 | B: 0.68473 | C: 0.68562 ** [JOINT LOSS] ** : 1.373654 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.004100 -> Layer: shared_layers.0.bias | Grad Mean: 0.001447 | Grad Max: 0.006355 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001760 | Grad Max: 0.010040 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024106 | Grad Max: 0.024106 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000011 | Grad Max: 0.000269 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000101 | Grad Max: 0.001189 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000004 | Grad Max: 0.000170 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000067 | Grad Max: 0.000586 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000093 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000072 | Grad Max: 0.000492 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000137 | Grad Max: 0.001063 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000341 | Grad Max: 0.001677 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003772 | Grad Max: 0.003772 [GRADIENT NORM TOTAL] 0.0668 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.140 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5145294 0.48547053] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 107/1941 | C: 180/1196 [LOSS Ex1] A: 0.68956 | B: 0.69351 | C: 0.68727 [LOGITS Ex2 A] Mean Abs: 0.068 | Max: 0.223 [LOSS Ex2] A: 0.67092 | B: 0.68268 | C: 0.67964 ** [JOINT LOSS] ** : 1.367858 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000329 | Grad Max: 0.004891 -> Layer: shared_layers.0.bias | Grad Mean: 0.001331 | Grad Max: 0.008628 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001793 | Grad Max: 0.010370 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.013410 | Grad Max: 0.013410 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000014 | Grad Max: 0.000331 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000138 | Grad Max: 0.001324 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000005 | Grad Max: 0.000254 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000094 | Grad Max: 0.000703 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000130 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000106 | Grad Max: 0.000687 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000190 | Grad Max: 0.001495 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000509 | Grad Max: 0.002465 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009612 | Grad Max: 0.009612 [GRADIENT NORM TOTAL] 0.0718 [EPOCH SUMMARY] Train Loss: 1.3137 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.3346 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: inf -> New: 1.3346) ############################## EPOCH 2/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.143 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113414 0.4886586] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 86/1962 | C: 237/1811 [LOSS Ex1] A: 0.69218 | B: 0.69230 | C: 0.69106 [LOGITS Ex2 A] Mean Abs: 0.084 | Max: 0.262 [LOSS Ex2] A: 0.66491 | B: 0.68147 | C: 0.68477 ** [JOINT LOSS] ** : 1.368896 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000359 | Grad Max: 0.004874 -> Layer: shared_layers.0.bias | Grad Mean: 0.002497 | Grad Max: 0.011833 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002229 | Grad Max: 0.011853 -> Layer: exit1_layers.0.bias | Grad Mean: 0.036587 | Grad Max: 0.036587 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000018 | Grad Max: 0.000599 
-> Layer: exit2_layers.0.bias | Grad Mean: 0.000285 | Grad Max: 0.002927 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000299 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000215 | Grad Max: 0.001339 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000209 | Grad Max: 0.001226 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.002921 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000410 | Grad Max: 0.002644 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020730 | Grad Max: 0.020730 [GRADIENT NORM TOTAL] 0.1048 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5200744 0.47992554] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 95/1761 | C: 260/1788 [LOSS Ex1] A: 0.69133 | B: 0.69363 | C: 0.68700 [LOGITS Ex2 A] Mean Abs: 0.095 | Max: 0.309 [LOSS Ex2] A: 0.66574 | B: 0.67965 | C: 0.67877 ** [JOINT LOSS] ** : 1.365372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000357 | Grad Max: 0.005013 -> Layer: shared_layers.0.bias | Grad Mean: 0.001934 | Grad Max: 0.008305 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.010390 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022877 | Grad Max: 0.022877 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000017 | Grad Max: 0.000538 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000197 | Grad Max: 0.002222 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000006 | Grad Max: 0.000249 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000122 | Grad Max: 0.001103 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000140 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000116 | Grad Max: 0.000784 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000191 | Grad Max: 0.002018 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.002031 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006191 | Grad Max: 0.006191 [GRADIENT NORM TOTAL] 0.0786 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078551 0.49214488] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 106/1942 | C: 219/1829 [LOSS Ex1] A: 0.00000 | B: 0.69333 | C: 0.69038 [LOGITS Ex2 A] Mean Abs: 0.108 | Max: 0.354 [LOSS Ex2] A: 0.66084 | B: 0.67535 | C: 0.67993 ** [JOINT LOSS] ** : 1.133273 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000326 | Grad Max: 0.005157 -> Layer: shared_layers.0.bias | Grad Mean: 0.001947 | Grad Max: 0.009096 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001168 | Grad Max: 0.004467 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022083 | Grad Max: 0.022083 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000019 | Grad Max: 0.000615 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000174 | Grad Max: 0.002597 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000006 | Grad Max: 0.000207 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000093 | Grad Max: 0.000699 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000003 | Grad Max: 0.000103 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000094 | Grad Max: 0.000647 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.001619 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000550 | Grad Max: 0.002265 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004386 | Grad Max: 0.004386 [GRADIENT NORM TOTAL] 0.0739 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | 
Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.106 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51048183 0.48951814] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 106/1942 | C: 240/1808 [LOSS Ex1] A: 0.00000 | B: 0.69326 | C: 0.68914 [LOGITS Ex2 A] Mean Abs: 0.129 | Max: 0.422 [LOSS Ex2] A: 0.65776 | B: 0.67115 | C: 0.67434 ** [JOINT LOSS] ** : 1.128550 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000388 | Grad Max: 0.005712 -> Layer: shared_layers.0.bias | Grad Mean: 0.001840 | Grad Max: 0.013325 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001052 | Grad Max: 0.004318 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018871 | Grad Max: 0.018871 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000023 | Grad Max: 0.000610 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000260 | Grad Max: 0.002804 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000307 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000155 | Grad Max: 0.001229 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000132 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000134 | Grad Max: 0.000813 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000186 | Grad Max: 0.001409 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000680 | Grad Max: 0.002762 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009435 | Grad Max: 0.009435 [GRADIENT NORM TOTAL] 0.0854 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.154 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5230074 0.47699255] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 86/1962 | C: 239/1809 [LOSS Ex1] A: 0.69094 | B: 0.69204 | C: 0.68893 [LOGITS Ex2 A] Mean Abs: 0.149 | Max: 0.474 [LOSS Ex2] 
A: 0.65880 | B: 0.66525 | C: 0.67321 ** [JOINT LOSS] ** : 1.356386 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000487 | Grad Max: 0.008658 -> Layer: shared_layers.0.bias | Grad Mean: 0.002279 | Grad Max: 0.018578 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001898 | Grad Max: 0.010941 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024539 | Grad Max: 0.024539 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000026 | Grad Max: 0.000699 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000266 | Grad Max: 0.003601 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000293 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000144 | Grad Max: 0.001029 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000134 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000116 | Grad Max: 0.000730 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000165 | Grad Max: 0.001357 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000665 | Grad Max: 0.002914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008034 | Grad Max: 0.008034 [GRADIENT NORM TOTAL] 0.0979 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.155 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51590544 0.48409456] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 95/1761 | C: 241/1807 [LOSS Ex1] A: 0.00000 | B: 0.69338 | C: 0.68797 [LOGITS Ex2 A] Mean Abs: 0.162 | Max: 0.555 [LOSS Ex2] A: 0.64760 | B: 0.66422 | C: 0.66446 ** [JOINT LOSS] ** : 1.119212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000468 | Grad Max: 0.007503 -> Layer: shared_layers.0.bias | Grad Mean: 0.002450 | Grad Max: 0.014209 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001095 | Grad Max: 0.004676 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019605 | Grad Max: 0.019605 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000028 | Grad Max: 0.000889 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000253 | Grad Max: 0.003051 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000294 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000122 | Grad Max: 0.000881 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000120 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000100 | Grad Max: 0.000659 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000159 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000168 | Grad Max: 0.001036 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000786 | Grad Max: 0.002942 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001387 | Grad Max: 0.001387 [GRADIENT NORM TOTAL] 0.0925 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.177 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5224882 0.47751182] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 105/1943 | C: 251/1797 [LOSS Ex1] A: 0.69011 | B: 0.69308 | C: 0.68847 [LOGITS Ex2 A] Mean Abs: 0.179 | Max: 0.602 [LOSS Ex2] A: 0.64161 | B: 0.66106 | C: 0.66357 ** [JOINT LOSS] ** : 1.345968 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000564 | Grad Max: 0.010050 -> Layer: shared_layers.0.bias | Grad Mean: 0.002798 | Grad Max: 0.023381 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.011440 -> Layer: exit1_layers.0.bias | Grad Mean: 0.027074 | Grad Max: 0.027074 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000031 | Grad Max: 0.001052 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000268 | Grad Max: 0.003639 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000008 | Grad Max: 0.000250 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000123 | Grad Max: 0.000845 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000004 | Grad Max: 0.000139 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.000094 | Grad Max: 0.000635 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000155 | Grad Max: 0.001062 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000801 | Grad Max: 0.002699 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000806 | Grad Max: 0.000806 [GRADIENT NORM TOTAL] 0.1087 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5234131 0.47658685] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 103/1945 | C: 232/1816 [LOSS Ex1] A: 0.68935 | B: 0.69303 | C: 0.68768 [LOGITS Ex2 A] Mean Abs: 0.195 | Max: 0.703 [LOSS Ex2] A: 0.65306 | B: 0.65702 | C: 0.65335 ** [JOINT LOSS] ** : 1.344498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000653 | Grad Max: 0.011215 -> Layer: shared_layers.0.bias | Grad Mean: 0.002823 | Grad Max: 0.027862 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.010119 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017481 | Grad Max: 0.017481 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000039 | Grad Max: 0.001102 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000337 | Grad Max: 0.004805 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000010 | Grad Max: 0.000416 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000165 | Grad Max: 0.001052 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000135 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000108 | Grad Max: 0.000777 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000165 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000962 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000747 | Grad Max: 0.003018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008489 | Grad Max: 0.008489 [GRADIENT NORM TOTAL] 
0.1152 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.139 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5125335 0.4874665] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 82/1966 | C: 245/1803 [LOSS Ex1] A: 0.00000 | B: 0.69179 | C: 0.68820 [LOGITS Ex2 A] Mean Abs: 0.214 | Max: 0.848 [LOSS Ex2] A: 0.64094 | B: 0.64954 | C: 0.65725 ** [JOINT LOSS] ** : 1.109238 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000714 | Grad Max: 0.013491 -> Layer: shared_layers.0.bias | Grad Mean: 0.003627 | Grad Max: 0.027762 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001089 | Grad Max: 0.004856 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018156 | Grad Max: 0.018156 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000045 | Grad Max: 0.001489 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000366 | Grad Max: 0.006041 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000011 | Grad Max: 0.000327 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000151 | Grad Max: 0.000936 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000121 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000096 | Grad Max: 0.000677 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000124 | Grad Max: 0.000873 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000747 | Grad Max: 0.002652 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000234 | Grad Max: 0.000234 [GRADIENT NORM TOTAL] 0.1209 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.146 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50782883 0.49217114] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 92/1764 | C: 218/1830 [LOSS Ex1] A: 0.69133 | B: 
0.69314 | C: 0.69041 [LOGITS Ex2 A] Mean Abs: 0.249 | Max: 0.930 [LOSS Ex2] A: 0.63653 | B: 0.64502 | C: 0.65675 ** [JOINT LOSS] ** : 1.337726 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000754 | Grad Max: 0.013033 -> Layer: shared_layers.0.bias | Grad Mean: 0.006699 | Grad Max: 0.038463 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.011912 -> Layer: exit1_layers.0.bias | Grad Mean: 0.035176 | Grad Max: 0.035176 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000065 | Grad Max: 0.002250 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000875 | Grad Max: 0.007947 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000021 | Grad Max: 0.000736 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000520 | Grad Max: 0.002573 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000263 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000320 | Grad Max: 0.001422 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000309 | Grad Max: 0.001756 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000641 | Grad Max: 0.003498 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015760 | Grad Max: 0.015760 [GRADIENT NORM TOTAL] 0.1811 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51857543 0.48142457] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 104/1944 | C: 220/1828 [LOSS Ex1] A: 0.69071 | B: 0.69284 | C: 0.68744 [LOGITS Ex2 A] Mean Abs: 0.260 | Max: 0.955 [LOSS Ex2] A: 0.64911 | B: 0.63519 | C: 0.64405 ** [JOINT LOSS] ** : 1.333116 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.023314 -> Layer: shared_layers.0.bias | Grad Mean: 0.006703 | Grad Max: 0.046837 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001776 | Grad Max: 0.010218 
-> Layer: exit1_layers.0.bias | Grad Mean: 0.022812 | Grad Max: 0.022812 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000070 | Grad Max: 0.002128 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000985 | Grad Max: 0.006912 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.000883 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000614 | Grad Max: 0.003363 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000271 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000370 | Grad Max: 0.001557 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000345 | Grad Max: 0.001766 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001012 | Grad Max: 0.005338 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021675 | Grad Max: 0.021675 [GRADIENT NORM TOTAL] 0.2067 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.093 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50461024 0.4953898 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 102/1946 | C: 213/1835 [LOSS Ex1] A: 0.00000 | B: 0.69281 | C: 0.68740 [LOGITS Ex2 A] Mean Abs: 0.271 | Max: 1.035 [LOSS Ex2] A: 0.63825 | B: 0.62868 | C: 0.64292 ** [JOINT LOSS] ** : 1.096682 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000990 | Grad Max: 0.026298 -> Layer: shared_layers.0.bias | Grad Mean: 0.007545 | Grad Max: 0.046133 -> Layer: exit1_layers.0.weight | Grad Mean: 0.000968 | Grad Max: 0.004414 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014022 | Grad Max: 0.014022 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000074 | Grad Max: 0.002278 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001077 | Grad Max: 0.008500 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.000744 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000642 | Grad Max: 0.002937 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000268 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000367 | Grad Max: 0.001512 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000307 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000331 | Grad Max: 0.001571 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001000 | Grad Max: 0.004966 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019862 | Grad Max: 0.019862 [GRADIENT NORM TOTAL] 0.2134 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.105 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082936 0.49170634] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 82/1966 | C: 231/1817 [LOSS Ex1] A: 0.00000 | B: 0.69156 | C: 0.68627 [LOGITS Ex2 A] Mean Abs: 0.285 | Max: 1.057 [LOSS Ex2] A: 0.61686 | B: 0.63328 | C: 0.63068 ** [JOINT LOSS] ** : 1.086216 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000937 | Grad Max: 0.019605 -> Layer: shared_layers.0.bias | Grad Mean: 0.008149 | Grad Max: 0.049096 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001001 | Grad Max: 0.004859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011017 | Grad Max: 0.011017 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000080 | Grad Max: 0.002448 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001061 | Grad Max: 0.010500 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.000803 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000580 | Grad Max: 0.002843 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000267 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.001212 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000247 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000281 | Grad Max: 0.001364 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000852 | Grad Max: 0.003998 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.012960 | Grad Max: 0.012960 [GRADIENT NORM TOTAL] 0.2015 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.158 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5215471 0.4784529] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 2/2046 | B: 91/1765 | C: 176/1200 [LOSS Ex1] A: 0.69033 | B: 0.69291 | C: 0.68661 [LOGITS Ex2 A] Mean Abs: 0.294 | Max: 1.129 [LOSS Ex2] A: 0.61353 | B: 0.63024 | C: 0.63309 ** [JOINT LOSS] ** : 1.315575 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.000990 | Grad Max: 0.017834 -> Layer: shared_layers.0.bias | Grad Mean: 0.013137 | Grad Max: 0.071255 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.011159 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022340 | Grad Max: 0.022340 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000123 | Grad Max: 0.003386 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002015 | Grad Max: 0.014653 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.001133 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001244 | Grad Max: 0.005344 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000428 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000682 | Grad Max: 0.002480 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000456 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000581 | Grad Max: 0.002241 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001536 | Grad Max: 0.005940 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030050 | Grad Max: 0.030050 [GRADIENT NORM TOTAL] 0.3268 [EPOCH SUMMARY] Train Loss: 1.2458 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.2776 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.3346 -> New: 1.2776) ############################## EPOCH 3/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.158 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5145997 0.48540035] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 104/1944 | C: 218/1830 [LOSS Ex1] A: 0.00000 | B: 0.69263 | C: 0.68681 [LOGITS Ex2 A] Mean Abs: 0.303 | Max: 1.372 [LOSS Ex2] A: 0.61768 | B: 0.61410 | C: 0.63222 ** [JOINT LOSS] ** : 1.081145 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001117 | Grad Max: 0.027060 -> Layer: shared_layers.0.bias | Grad Mean: 0.008944 | Grad Max: 0.043180 -> Layer: exit1_layers.0.weight | Grad Mean: 0.000986 | Grad Max: 0.004560 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014544 | Grad Max: 0.014544 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000086 | Grad Max: 0.002419 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001179 | Grad Max: 0.007743 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.000815 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000622 | Grad Max: 0.003297 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000317 | Grad Max: 0.001322 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000256 | Grad Max: 0.001203 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001030 | Grad Max: 0.005488 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013948 | Grad Max: 0.013948 [GRADIENT NORM TOTAL] 0.2223 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5210887 0.47891128] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 
0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 102/1946 | C: 225/1823 [LOSS Ex1] A: 0.68955 | B: 0.69260 | C: 0.68444 [LOGITS Ex2 A] Mean Abs: 0.315 | Max: 1.297 [LOSS Ex2] A: 0.61685 | B: 0.60470 | C: 0.62445 ** [JOINT LOSS] ** : 1.304197 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001258 | Grad Max: 0.029777 -> Layer: shared_layers.0.bias | Grad Mean: 0.020291 | Grad Max: 0.091638 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.010589 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014122 | Grad Max: 0.014122 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000176 | Grad Max: 0.003858 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003068 | Grad Max: 0.018550 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.002007 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001877 | Grad Max: 0.008301 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000645 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000975 | Grad Max: 0.003402 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000558 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000779 | Grad Max: 0.002754 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002554 | Grad Max: 0.010251 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041153 | Grad Max: 0.041153 [GRADIENT NORM TOTAL] 0.4838 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52205557 0.47794446] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 81/1967 | C: 240/1808 [LOSS Ex1] A: 0.68900 | B: 0.69134 | C: 0.68692 [LOGITS Ex2 A] Mean Abs: 0.293 | Max: 1.321 [LOSS Ex2] A: 0.61256 | B: 0.60469 | C: 0.61878 ** [JOINT LOSS] ** : 1.301098 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001210 | Grad Max: 0.022391 -> Layer: shared_layers.0.bias | Grad Mean: 
0.007742 | Grad Max: 0.056442 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001625 | Grad Max: 0.009413 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013874 | Grad Max: 0.013874 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000090 | Grad Max: 0.002347 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000993 | Grad Max: 0.011694 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.000746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000519 | Grad Max: 0.002536 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000296 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000262 | Grad Max: 0.001324 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.001215 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001057 | Grad Max: 0.004575 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009451 | Grad Max: 0.009451 [GRADIENT NORM TOTAL] 0.2120 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.139 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51104176 0.48895824] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 91/1765 | C: 234/1814 [LOSS Ex1] A: 0.00000 | B: 0.69271 | C: 0.68861 [LOGITS Ex2 A] Mean Abs: 0.306 | Max: 1.339 [LOSS Ex2] A: 0.60089 | B: 0.61249 | C: 0.62518 ** [JOINT LOSS] ** : 1.073291 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001269 | Grad Max: 0.024607 -> Layer: shared_layers.0.bias | Grad Mean: 0.024117 | Grad Max: 0.112984 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001250 | Grad Max: 0.004946 -> Layer: exit1_layers.0.bias | Grad Mean: 0.023051 | Grad Max: 0.023051 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000229 | Grad Max: 0.005544 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004116 | Grad Max: 0.026181 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad 
Max: 0.002230 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002505 | Grad Max: 0.010020 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000732 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001258 | Grad Max: 0.003782 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000835 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.003472 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003207 | Grad Max: 0.009636 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048195 | Grad Max: 0.048195 [GRADIENT NORM TOTAL] 0.6019 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.148 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50557864 0.49442136] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 104/1944 | C: 219/1829 [LOSS Ex1] A: 0.69081 | B: 0.69243 | C: 0.68907 [LOGITS Ex2 A] Mean Abs: 0.314 | Max: 1.458 [LOSS Ex2] A: 0.60321 | B: 0.60062 | C: 0.61624 ** [JOINT LOSS] ** : 1.297456 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001308 | Grad Max: 0.017819 -> Layer: shared_layers.0.bias | Grad Mean: 0.005287 | Grad Max: 0.026603 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.011482 -> Layer: exit1_layers.0.bias | Grad Mean: 0.031404 | Grad Max: 0.031404 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000078 | Grad Max: 0.002195 -> Layer: exit2_layers.0.bias | Grad Mean: 0.000657 | Grad Max: 0.011530 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000017 | Grad Max: 0.000647 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000214 | Grad Max: 0.001668 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000222 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000102 | Grad Max: 0.000801 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000082 | Grad Max: 0.000702 -> 
Layer: exit2_layers.12.weight | Grad Mean: 0.001070 | Grad Max: 0.003596 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003087 | Grad Max: 0.003087 [GRADIENT NORM TOTAL] 0.1833 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51748586 0.4825141 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 102/1946 | C: 241/1807 [LOSS Ex1] A: 0.69033 | B: 0.69241 | C: 0.68662 [LOGITS Ex2 A] Mean Abs: 0.340 | Max: 1.549 [LOSS Ex2] A: 0.62305 | B: 0.58884 | C: 0.61141 ** [JOINT LOSS] ** : 1.297554 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001718 | Grad Max: 0.044808 -> Layer: shared_layers.0.bias | Grad Mean: 0.037290 | Grad Max: 0.163126 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001766 | Grad Max: 0.010041 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020341 | Grad Max: 0.020341 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000326 | Grad Max: 0.006887 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005811 | Grad Max: 0.036221 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.002961 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003484 | Grad Max: 0.013855 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.001001 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001671 | Grad Max: 0.005293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000973 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001223 | Grad Max: 0.004492 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004451 | Grad Max: 0.015220 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061124 | Grad Max: 0.061124 [GRADIENT NORM TOTAL] 0.8594 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.091 [SAMPLE 0 PREDICTION A] Top2 Probs: 
[0.5025032 0.4974968] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 81/1967 | C: 231/1817 [LOSS Ex1] A: 0.00000 | B: 0.69114 | C: 0.68912 [LOGITS Ex2 A] Mean Abs: 0.312 | Max: 1.625 [LOSS Ex2] A: 0.59467 | B: 0.58459 | C: 0.59669 ** [JOINT LOSS] ** : 1.052070 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001361 | Grad Max: 0.022746 -> Layer: shared_layers.0.bias | Grad Mean: 0.012618 | Grad Max: 0.055932 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001151 | Grad Max: 0.004817 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020524 | Grad Max: 0.020524 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000122 | Grad Max: 0.003681 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001762 | Grad Max: 0.014622 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000034 | Grad Max: 0.000985 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000902 | Grad Max: 0.004056 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000345 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000420 | Grad Max: 0.001755 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000312 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000297 | Grad Max: 0.001273 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001474 | Grad Max: 0.006018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014891 | Grad Max: 0.014891 [GRADIENT NORM TOTAL] 0.2978 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.105 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070116 0.49298844] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 90/1766 | C: 234/1814 [LOSS Ex1] A: 0.00000 | B: 0.69251 | C: 0.68761 [LOGITS Ex2 A] Mean Abs: 0.327 | Max: 1.453 [LOSS Ex2] A: 0.58418 | B: 0.58977 | C: 0.60109 ** [JOINT LOSS] ** : 1.051720 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 
0.001488 | Grad Max: 0.026754 -> Layer: shared_layers.0.bias | Grad Mean: 0.024055 | Grad Max: 0.116033 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001081 | Grad Max: 0.004577 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018517 | Grad Max: 0.018517 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000230 | Grad Max: 0.006011 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004226 | Grad Max: 0.025914 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000085 | Grad Max: 0.002168 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002470 | Grad Max: 0.010328 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000794 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001151 | Grad Max: 0.004022 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000774 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000819 | Grad Max: 0.003145 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002899 | Grad Max: 0.009886 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039225 | Grad Max: 0.039225 [GRADIENT NORM TOTAL] 0.5902 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.161 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5204957 0.4795043] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 104/1944 | C: 220/1828 [LOSS Ex1] A: 0.68993 | B: 0.69223 | C: 0.68774 [LOGITS Ex2 A] Mean Abs: 0.333 | Max: 1.633 [LOSS Ex2] A: 0.58853 | B: 0.59361 | C: 0.60708 ** [JOINT LOSS] ** : 1.286374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.034253 -> Layer: shared_layers.0.bias | Grad Mean: 0.024894 | Grad Max: 0.122559 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001852 | Grad Max: 0.010724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022579 | Grad Max: 0.022579 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000243 | Grad Max: 0.006546 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004403 | Grad 
Max: 0.029231 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.002252 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002558 | Grad Max: 0.010325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000722 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001176 | Grad Max: 0.003667 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000733 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000827 | Grad Max: 0.002918 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003056 | Grad Max: 0.009499 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039476 | Grad Max: 0.039476 [GRADIENT NORM TOTAL] 0.6137 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.160 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51377565 0.48622432] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 100/1948 | C: 230/1818 [LOSS Ex1] A: 0.00000 | B: 0.69222 | C: 0.68658 [LOGITS Ex2 A] Mean Abs: 0.334 | Max: 1.700 [LOSS Ex2] A: 0.58790 | B: 0.57700 | C: 0.58962 ** [JOINT LOSS] ** : 1.044441 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001341 | Grad Max: 0.025257 -> Layer: shared_layers.0.bias | Grad Mean: 0.020703 | Grad Max: 0.094050 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001009 | Grad Max: 0.004691 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012883 | Grad Max: 0.012883 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000185 | Grad Max: 0.004520 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002999 | Grad Max: 0.020306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.001471 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001586 | Grad Max: 0.006720 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000398 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000707 | Grad Max: 0.002312 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000435 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000479 | Grad Max: 0.001677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001858 | Grad Max: 0.007260 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023408 | Grad Max: 0.023408 [GRADIENT NORM TOTAL] 0.4471 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52012324 0.4798768 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 80/1968 | C: 220/1828 [LOSS Ex1] A: 0.68915 | B: 0.69094 | C: 0.68885 [LOGITS Ex2 A] Mean Abs: 0.347 | Max: 1.712 [LOSS Ex2] A: 0.59167 | B: 0.57734 | C: 0.59006 ** [JOINT LOSS] ** : 1.276003 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.027598 -> Layer: shared_layers.0.bias | Grad Mean: 0.030497 | Grad Max: 0.134791 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.011451 -> Layer: exit1_layers.0.bias | Grad Mean: 0.025775 | Grad Max: 0.025775 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000270 | Grad Max: 0.006310 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004598 | Grad Max: 0.030058 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.002004 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002561 | Grad Max: 0.008877 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000598 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001126 | Grad Max: 0.003385 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000632 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000756 | Grad Max: 0.002537 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002961 | Grad Max: 0.010046 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036543 | Grad Max: 0.036543 [GRADIENT NORM TOTAL] 0.6644 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.029 | Max: 0.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5210129 0.47898713] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 90/1766 | C: 250/1798 [LOSS Ex1] A: 0.68873 | B: 0.69232 | C: 0.68858 [LOGITS Ex2 A] Mean Abs: 0.338 | Max: 1.751 [LOSS Ex2] A: 0.58697 | B: 0.56462 | C: 0.56497 ** [JOINT LOSS] ** : 1.262064 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001372 | Grad Max: 0.025513 -> Layer: shared_layers.0.bias | Grad Mean: 0.008341 | Grad Max: 0.036859 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.010371 -> Layer: exit1_layers.0.bias | Grad Mean: 0.023509 | Grad Max: 0.023509 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000096 | Grad Max: 0.003008 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001055 | Grad Max: 0.011698 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000022 | Grad Max: 0.000734 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000441 | Grad Max: 0.002556 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000215 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000189 | Grad Max: 0.001069 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000125 | Grad Max: 0.001113 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001474 | Grad Max: 0.004824 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006138 | Grad Max: 0.006138 [GRADIENT NORM TOTAL] 0.2260 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.138 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50984466 0.49015537] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 101/1947 | C: 250/1798 [LOSS Ex1] A: 0.00000 | B: 0.69204 | C: 0.68603 [LOGITS Ex2 A] Mean Abs: 0.347 | Max: 1.791 [LOSS Ex2] A: 0.57763 | B: 0.57639 | C: 0.58617 ** [JOINT LOSS] ** : 
1.039423 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001509 | Grad Max: 0.031986 -> Layer: shared_layers.0.bias | Grad Mean: 0.019508 | Grad Max: 0.100065 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001028 | Grad Max: 0.004720 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013638 | Grad Max: 0.013638 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000198 | Grad Max: 0.005656 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003623 | Grad Max: 0.023917 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000069 | Grad Max: 0.001876 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001999 | Grad Max: 0.007842 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000548 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000871 | Grad Max: 0.002663 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000491 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.002022 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002141 | Grad Max: 0.006724 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026595 | Grad Max: 0.026595 [GRADIENT NORM TOTAL] 0.4825 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.150 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036451 0.4963549] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 83/1965 | C: 158/1218 [LOSS Ex1] A: 0.69035 | B: 0.69204 | C: 0.68829 [LOGITS Ex2 A] Mean Abs: 0.371 | Max: 1.885 [LOSS Ex2] A: 0.57115 | B: 0.57244 | C: 0.59703 ** [JOINT LOSS] ** : 1.270436 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001811 | Grad Max: 0.033460 -> Layer: shared_layers.0.bias | Grad Mean: 0.029756 | Grad Max: 0.151382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.011944 -> Layer: exit1_layers.0.bias | Grad Mean: 0.030981 | Grad Max: 0.030981 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000278 | Grad 
Max: 0.007220 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005299 | Grad Max: 0.031937 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000099 | Grad Max: 0.002684 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002945 | Grad Max: 0.010801 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000687 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001249 | Grad Max: 0.003735 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000673 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000807 | Grad Max: 0.002920 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003074 | Grad Max: 0.009367 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036744 | Grad Max: 0.036744 [GRADIENT NORM TOTAL] 0.6960 [EPOCH SUMMARY] Train Loss: 1.1884 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1823 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.2776 -> New: 1.1823) ############################## EPOCH 4/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51658 0.48341998] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 71/1977 | C: 235/1813 [LOSS Ex1] A: 0.68998 | B: 0.69075 | C: 0.68936 [LOGITS Ex2 A] Mean Abs: 0.385 | Max: 1.946 [LOSS Ex2] A: 0.58553 | B: 0.56031 | C: 0.56623 ** [JOINT LOSS] ** : 1.260717 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001726 | Grad Max: 0.031083 -> Layer: shared_layers.0.bias | Grad Mean: 0.027957 | Grad Max: 0.127689 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.010791 -> Layer: exit1_layers.0.bias | Grad Mean: 0.027551 | Grad Max: 0.027551 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000277 | Grad Max: 0.006692 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004569 | Grad Max: 0.033027 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.002430 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002521 | Grad Max: 0.010631 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000661 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001054 | Grad Max: 0.003118 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000552 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000665 | Grad Max: 0.002291 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002772 | Grad Max: 0.009569 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031114 | Grad Max: 0.031114 [GRADIENT NORM TOTAL] 0.6376 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.090 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50045496 0.49954507] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 83/1773 | C: 236/1812 [LOSS Ex1] A: 0.00000 | B: 0.69213 | C: 0.68762 [LOGITS Ex2 A] Mean Abs: 0.404 | Max: 2.173 [LOSS Ex2] A: 0.57409 | B: 0.55181 | C: 0.56259 ** [JOINT LOSS] ** : 1.022748 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001769 | Grad Max: 0.038191 -> Layer: shared_layers.0.bias | Grad Mean: 0.039509 | Grad Max: 0.184904 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001160 | Grad Max: 0.004580 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019888 | Grad Max: 0.019888 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000365 | Grad Max: 0.008395 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006438 | Grad Max: 0.044898 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.002795 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003555 | Grad Max: 0.013272 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000788 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001469 | Grad Max: 0.004225 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000802 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.003244 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003751 | Grad Max: 0.011937 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042486 | Grad Max: 0.042486 [GRADIENT NORM TOTAL] 0.8800 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505698 0.49430197] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 87/1961 | C: 201/1847 [LOSS Ex1] A: 0.00000 | B: 0.69186 | C: 0.68813 [LOGITS Ex2 A] Mean Abs: 0.418 | Max: 1.984 [LOSS Ex2] A: 0.55556 | B: 0.56915 | C: 0.55717 ** [JOINT LOSS] ** : 1.020622 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001315 | Grad Max: 0.023442 -> Layer: shared_layers.0.bias | Grad Mean: 0.015611 | Grad Max: 0.080299 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001144 | Grad Max: 0.004641 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018053 | Grad Max: 0.018053 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000142 | Grad Max: 0.006183 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002706 | Grad Max: 0.033417 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.001814 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001325 | Grad Max: 0.007845 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000348 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000535 | Grad Max: 0.001862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000449 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000329 | Grad Max: 0.001518 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001261 | Grad Max: 0.005177 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014203 | Grad Max: 0.014203 [GRADIENT NORM TOTAL] 0.3564 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 
0.027 | Max: 0.164 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5194731 0.4805269] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 81/1967 | C: 216/1832 [LOSS Ex1] A: 0.68950 | B: 0.69187 | C: 0.68695 [LOGITS Ex2 A] Mean Abs: 0.440 | Max: 2.143 [LOSS Ex2] A: 0.56528 | B: 0.56719 | C: 0.57345 ** [JOINT LOSS] ** : 1.258078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001924 | Grad Max: 0.036528 -> Layer: shared_layers.0.bias | Grad Mean: 0.056701 | Grad Max: 0.273019 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.010231 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019777 | Grad Max: 0.019777 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000483 | Grad Max: 0.011386 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009224 | Grad Max: 0.054606 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.003693 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004907 | Grad Max: 0.018130 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.001111 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001970 | Grad Max: 0.005466 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.001019 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001199 | Grad Max: 0.004081 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004711 | Grad Max: 0.012029 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053672 | Grad Max: 0.053672 [GRADIENT NORM TOTAL] 1.1941 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.162 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51299936 0.48700064] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 59/1989 | C: 243/1805 [LOSS Ex1] A: 0.00000 | B: 0.69057 | C: 0.68517 [LOGITS Ex2 A] Mean Abs: 0.437 | Max: 2.274 [LOSS Ex2] A: 0.54859 | B: 0.54767 | C: 0.55146 ** [JOINT LOSS] ** : 1.007817 [GRADIENTS 
CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001277 | Grad Max: 0.023969 -> Layer: shared_layers.0.bias | Grad Mean: 0.012185 | Grad Max: 0.059310 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001004 | Grad Max: 0.005354 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010316 | Grad Max: 0.010316 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000113 | Grad Max: 0.006230 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002045 | Grad Max: 0.033078 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000032 | Grad Max: 0.001142 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000906 | Grad Max: 0.004793 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000268 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000356 | Grad Max: 0.001337 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000262 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000215 | Grad Max: 0.000992 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001023 | Grad Max: 0.003959 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008919 | Grad Max: 0.008919 [GRADIENT NORM TOTAL] 0.2772 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51910555 0.48089442] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 73/1783 | C: 200/1848 [LOSS Ex1] A: 0.68872 | B: 0.69195 | C: 0.68762 [LOGITS Ex2 A] Mean Abs: 0.454 | Max: 2.443 [LOSS Ex2] A: 0.56003 | B: 0.54669 | C: 0.57373 ** [JOINT LOSS] ** : 1.249581 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.038854 -> Layer: shared_layers.0.bias | Grad Mean: 0.053131 | Grad Max: 0.252947 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001988 | Grad Max: 0.011387 -> Layer: exit1_layers.0.bias | Grad Mean: 0.023867 | Grad Max: 0.023867 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000489 | Grad Max: 0.011179 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.008584 | Grad Max: 0.059600 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.003560 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004578 | Grad Max: 0.016276 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000872 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001801 | Grad Max: 0.005114 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000867 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001071 | Grad Max: 0.003381 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004479 | Grad Max: 0.012335 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048533 | Grad Max: 0.048533 [GRADIENT NORM TOTAL] 1.1424 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.164 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5199457 0.4800543] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 79/1969 | C: 207/1841 [LOSS Ex1] A: 0.68842 | B: 0.69167 | C: 0.68823 [LOGITS Ex2 A] Mean Abs: 0.450 | Max: 2.416 [LOSS Ex2] A: 0.56217 | B: 0.55174 | C: 0.58116 ** [JOINT LOSS] ** : 1.254463 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.041599 -> Layer: shared_layers.0.bias | Grad Mean: 0.040599 | Grad Max: 0.207744 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001660 | Grad Max: 0.009309 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015902 | Grad Max: 0.015902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000388 | Grad Max: 0.009591 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006709 | Grad Max: 0.046680 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.003122 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003549 | Grad Max: 0.012660 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000798 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001389 | Grad Max: 0.004095 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000698 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000818 | Grad Max: 0.002705 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003449 | Grad Max: 0.009603 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036818 | Grad Max: 0.036818 [GRADIENT NORM TOTAL] 0.8925 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.137 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50873 0.49127] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 75/1973 | C: 206/1842 [LOSS Ex1] A: 0.00000 | B: 0.69170 | C: 0.68738 [LOGITS Ex2 A] Mean Abs: 0.462 | Max: 2.251 [LOSS Ex2] A: 0.55596 | B: 0.55642 | C: 0.55643 ** [JOINT LOSS] ** : 1.015962 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001726 | Grad Max: 0.034641 -> Layer: shared_layers.0.bias | Grad Mean: 0.056795 | Grad Max: 0.277652 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.004642 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017595 | Grad Max: 0.017595 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000475 | Grad Max: 0.011343 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009156 | Grad Max: 0.062887 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.003978 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004721 | Grad Max: 0.017240 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000917 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001816 | Grad Max: 0.005069 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000949 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001060 | Grad Max: 0.003521 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004225 | Grad Max: 0.010555 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046548 | Grad Max: 0.046548 [GRADIENT NORM TOTAL] 1.1636 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 
0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.151 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014873 0.49851266] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 58/1990 | C: 228/1820 [LOSS Ex1] A: 0.68987 | B: 0.69039 | C: 0.68709 [LOGITS Ex2 A] Mean Abs: 0.483 | Max: 2.233 [LOSS Ex2] A: 0.55310 | B: 0.55903 | C: 0.57008 ** [JOINT LOSS] ** : 1.249853 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.038801 -> Layer: shared_layers.0.bias | Grad Mean: 0.068409 | Grad Max: 0.330292 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.011074 -> Layer: exit1_layers.0.bias | Grad Mean: 0.027013 | Grad Max: 0.027013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000565 | Grad Max: 0.014768 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010946 | Grad Max: 0.069838 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.004321 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005630 | Grad Max: 0.019441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.001084 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002147 | Grad Max: 0.005904 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000985 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.004048 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004986 | Grad Max: 0.013220 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055020 | Grad Max: 0.055020 [GRADIENT NORM TOTAL] 1.3982 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5156842 0.4843158] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 73/1783 | C: 252/1796 [LOSS Ex1] A: 0.68961 | B: 0.69177 | C: 0.68292 [LOGITS Ex2 A] Mean Abs: 0.469 | Max: 2.476 [LOSS Ex2] A: 
0.56204 | B: 0.53634 | C: 0.54646 ** [JOINT LOSS] ** : 1.236382 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001420 | Grad Max: 0.026983 -> Layer: shared_layers.0.bias | Grad Mean: 0.007225 | Grad Max: 0.040864 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001751 | Grad Max: 0.009898 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013926 | Grad Max: 0.013926 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000119 | Grad Max: 0.004526 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001314 | Grad Max: 0.024233 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.000909 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000395 | Grad Max: 0.002627 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000147 | Grad Max: 0.001023 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000082 | Grad Max: 0.000513 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000574 | Grad Max: 0.002821 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004037 | Grad Max: 0.004037 [GRADIENT NORM TOTAL] 0.2208 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.089 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016016 0.49839842] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 78/1970 | C: 204/1844 [LOSS Ex1] A: 0.00000 | B: 0.69150 | C: 0.68517 [LOGITS Ex2 A] Mean Abs: 0.494 | Max: 2.574 [LOSS Ex2] A: 0.55821 | B: 0.55745 | C: 0.54856 ** [JOINT LOSS] ** : 1.013628 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002707 | Grad Max: 0.063385 -> Layer: shared_layers.0.bias | Grad Mean: 0.070725 | Grad Max: 0.346324 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.004639 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012606 | Grad Max: 0.012606 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000658 | Grad Max: 0.016227 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011917 | Grad Max: 0.075540 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000208 | Grad Max: 0.004619 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006158 | Grad Max: 0.022710 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.001188 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002330 | Grad Max: 0.006415 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000068 | Grad Max: 0.001029 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001330 | Grad Max: 0.004100 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005558 | Grad Max: 0.014280 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059058 | Grad Max: 0.059058 [GRADIENT NORM TOTAL] 1.5213 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50437933 0.49562064] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 0/1616 | B: 72/1976 | C: 207/1841 [LOSS Ex1] A: 0.00000 | B: 0.69153 | C: 0.68761 [LOGITS Ex2 A] Mean Abs: 0.493 | Max: 2.470 [LOSS Ex2] A: 0.55602 | B: 0.54570 | C: 0.55668 ** [JOINT LOSS] ** : 1.012515 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001842 | Grad Max: 0.042785 -> Layer: shared_layers.0.bias | Grad Mean: 0.042327 | Grad Max: 0.215639 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.004675 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017769 | Grad Max: 0.017769 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000404 | Grad Max: 0.009749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007079 | Grad Max: 0.048611 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.002653 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003604 | Grad Max: 0.013408 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000724 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.001360 | Grad Max: 0.003946 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000722 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000771 | Grad Max: 0.002671 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003245 | Grad Max: 0.009958 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034374 | Grad Max: 0.034374 [GRADIENT NORM TOTAL] 0.9127 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5185403 0.48145968] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 57/1991 | C: 218/1830 [LOSS Ex1] A: 0.68909 | B: 0.69021 | C: 0.68662 [LOGITS Ex2 A] Mean Abs: 0.498 | Max: 2.422 [LOSS Ex2] A: 0.54563 | B: 0.55751 | C: 0.55299 ** [JOINT LOSS] ** : 1.240685 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.050774 -> Layer: shared_layers.0.bias | Grad Mean: 0.065611 | Grad Max: 0.318099 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.010998 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022991 | Grad Max: 0.022991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000546 | Grad Max: 0.013489 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010600 | Grad Max: 0.065460 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.004082 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005303 | Grad Max: 0.018863 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000977 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001975 | Grad Max: 0.005399 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000879 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001112 | Grad Max: 0.003492 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004442 | Grad Max: 0.011297 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048192 | Grad Max: 0.048192 [GRADIENT NORM TOTAL] 
1.3286 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.164 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5123551 0.4876449] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 0/2048 | B: 72/1784 | C: 154/1222 [LOSS Ex1] A: 0.00000 | B: 0.69161 | C: 0.68545 [LOGITS Ex2 A] Mean Abs: 0.509 | Max: 2.426 [LOSS Ex2] A: 0.53863 | B: 0.54422 | C: 0.53839 ** [JOINT LOSS] ** : 0.999431 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002632 | Grad Max: 0.064329 -> Layer: shared_layers.0.bias | Grad Mean: 0.073531 | Grad Max: 0.367397 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001128 | Grad Max: 0.005063 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016704 | Grad Max: 0.016704 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000651 | Grad Max: 0.016345 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012403 | Grad Max: 0.079356 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000207 | Grad Max: 0.004879 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006203 | Grad Max: 0.021911 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.001151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002308 | Grad Max: 0.006545 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000065 | Grad Max: 0.001069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001296 | Grad Max: 0.004093 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005263 | Grad Max: 0.012831 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056272 | Grad Max: 0.056272 [GRADIENT NORM TOTAL] 1.5297 [EPOCH SUMMARY] Train Loss: 1.1316 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.1825 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 5/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51820445 0.48179555] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 76/1972 | C: 223/1825 [LOSS Ex1] A: 0.68830 | B: 0.69133 | C: 0.68625 [LOGITS Ex2 A] Mean Abs: 0.480 | Max: 2.493 [LOSS Ex2] A: 0.54048 | B: 0.54965 | C: 0.53131 ** [JOINT LOSS] ** : 1.229110 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001392 | Grad Max: 0.030426 -> Layer: shared_layers.0.bias | Grad Mean: 0.007379 | Grad Max: 0.053582 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002031 | Grad Max: 0.011351 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022244 | Grad Max: 0.022244 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000123 | Grad Max: 0.003881 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001480 | Grad Max: 0.019029 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000022 | Grad Max: 0.000885 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000345 | Grad Max: 0.003198 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000097 | Grad Max: 0.000804 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000050 | Grad Max: 0.000458 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001987 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000926 | Grad Max: 0.000926 [GRADIENT NORM TOTAL] 0.2323 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.163 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51902765 0.48097238] | Indices: [1 
0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 70/1978 | C: 226/1822 [LOSS Ex1] A: 0.68813 | B: 0.69137 | C: 0.68396 [LOGITS Ex2 A] Mean Abs: 0.496 | Max: 2.740 [LOSS Ex2] A: 0.55436 | B: 0.54192 | C: 0.54009 ** [JOINT LOSS] ** : 1.233278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002565 | Grad Max: 0.060624 -> Layer: shared_layers.0.bias | Grad Mean: 0.077463 | Grad Max: 0.380189 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001668 | Grad Max: 0.009089 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009470 | Grad Max: 0.009470 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000691 | Grad Max: 0.017956 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012475 | Grad Max: 0.091461 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000213 | Grad Max: 0.004879 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006355 | Grad Max: 0.021875 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.001146 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002343 | Grad Max: 0.006617 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000066 | Grad Max: 0.001005 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001309 | Grad Max: 0.004148 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005508 | Grad Max: 0.013076 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058357 | Grad Max: 0.058357 [GRADIENT NORM TOTAL] 1.6027 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.136 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078479 0.4921521] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 58/1990 | C: 206/1842 [LOSS Ex1] A: 0.00000 | B: 0.69005 | C: 0.68721 [LOGITS Ex2 A] Mean Abs: 0.486 | Max: 2.577 [LOSS Ex2] A: 0.54754 | B: 0.53051 | C: 0.52151 ** [JOINT LOSS] ** : 0.992273 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001659 | Grad Max: 0.032489 -> Layer: 
shared_layers.0.bias | Grad Mean: 0.044435 | Grad Max: 0.223910 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001207 | Grad Max: 0.004964 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019592 | Grad Max: 0.019592 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000395 | Grad Max: 0.010101 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006964 | Grad Max: 0.048802 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.003084 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003460 | Grad Max: 0.013326 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000741 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001272 | Grad Max: 0.004118 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000665 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000703 | Grad Max: 0.002374 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002987 | Grad Max: 0.007795 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031431 | Grad Max: 0.031431 [GRADIENT NORM TOTAL] 0.9060 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.153 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005552 0.49944478] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 69/1787 | C: 199/1849 [LOSS Ex1] A: 0.68942 | B: 0.69145 | C: 0.68551 [LOGITS Ex2 A] Mean Abs: 0.508 | Max: 2.530 [LOSS Ex2] A: 0.53781 | B: 0.53537 | C: 0.54862 ** [JOINT LOSS] ** : 1.229398 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001999 | Grad Max: 0.041426 -> Layer: shared_layers.0.bias | Grad Mean: 0.039534 | Grad Max: 0.215609 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.011132 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022018 | Grad Max: 0.022018 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000370 | Grad Max: 0.009865 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007002 | Grad Max: 0.053411 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.002721 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003349 | Grad Max: 0.012343 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000671 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001228 | Grad Max: 0.003901 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000603 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.002395 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002749 | Grad Max: 0.006975 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029070 | Grad Max: 0.029070 [GRADIENT NORM TOTAL] 0.8456 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5149608 0.4850392] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 76/1972 | C: 197/1851 [LOSS Ex1] A: 0.68926 | B: 0.69117 | C: 0.68820 [LOGITS Ex2 A] Mean Abs: 0.516 | Max: 2.476 [LOSS Ex2] A: 0.54233 | B: 0.55872 | C: 0.54988 ** [JOINT LOSS] ** : 1.239854 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001847 | Grad Max: 0.029661 -> Layer: shared_layers.0.bias | Grad Mean: 0.056954 | Grad Max: 0.282872 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001877 | Grad Max: 0.010159 -> Layer: exit1_layers.0.bias | Grad Mean: 0.026383 | Grad Max: 0.026383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.012564 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008913 | Grad Max: 0.064027 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.003219 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004345 | Grad Max: 0.014900 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000818 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001574 | Grad Max: 0.004405 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000764 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002851 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003551 | Grad Max: 0.010100 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037620 | Grad Max: 0.037620 [GRADIENT NORM TOTAL] 1.1269 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.088 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50345844 0.49654156] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 69/1979 | C: 217/1831 [LOSS Ex1] A: 0.00000 | B: 0.69122 | C: 0.68647 [LOGITS Ex2 A] Mean Abs: 0.510 | Max: 2.624 [LOSS Ex2] A: 0.53575 | B: 0.53694 | C: 0.55044 ** [JOINT LOSS] ** : 1.000271 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001301 | Grad Max: 0.026322 -> Layer: shared_layers.0.bias | Grad Mean: 0.022319 | Grad Max: 0.129022 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.004770 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015646 | Grad Max: 0.015646 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000209 | Grad Max: 0.007134 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003480 | Grad Max: 0.032073 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.001656 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001516 | Grad Max: 0.006819 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000314 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000551 | Grad Max: 0.001744 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000302 | Grad Max: 0.001197 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001266 | Grad Max: 0.003567 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013610 | Grad Max: 0.013610 [GRADIENT NORM TOTAL] 0.4583 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.024 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50323445 0.49676552] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 57/1991 | C: 240/1808 [LOSS Ex1] A: 0.00000 | B: 0.68988 | C: 0.68536 [LOGITS Ex2 A] Mean Abs: 0.534 | Max: 2.748 [LOSS Ex2] A: 0.53978 | B: 0.52962 | C: 0.54905 ** [JOINT LOSS] ** : 0.997897 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.029432 -> Layer: shared_layers.0.bias | Grad Mean: 0.043142 | Grad Max: 0.207854 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001093 | Grad Max: 0.005224 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015054 | Grad Max: 0.015054 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000409 | Grad Max: 0.010749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006899 | Grad Max: 0.058903 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.003248 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003382 | Grad Max: 0.014819 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000674 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001231 | Grad Max: 0.003422 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000557 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000681 | Grad Max: 0.002204 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002927 | Grad Max: 0.006929 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030691 | Grad Max: 0.030691 [GRADIENT NORM TOTAL] 0.8801 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.168 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5177475 0.48225248] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 69/1787 | C: 170/1878 [LOSS Ex1] A: 0.68870 | B: 0.69128 | C: 0.68741 [LOGITS Ex2 A] Mean Abs: 0.533 | Max: 2.698 [LOSS Ex2] A: 0.52806 | B: 0.52539 | C: 0.54068 ** [JOINT LOSS] ** : 1.220505 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001407 | Grad Max: 0.025096 -> Layer: shared_layers.0.bias | Grad Mean: 0.012316 | Grad Max: 0.083373 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001932 | Grad Max: 0.010945 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024072 | Grad Max: 0.024072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000119 | Grad Max: 0.005272 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001947 | Grad Max: 0.024667 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000030 | Grad Max: 0.001212 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000790 | Grad Max: 0.005084 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000279 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000278 | Grad Max: 0.001275 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000836 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000552 | Grad Max: 0.002175 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006103 | Grad Max: 0.006103 [GRADIENT NORM TOTAL] 0.2865 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117532 0.48824683] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 72/1976 | C: 211/1837 [LOSS Ex1] A: 0.00000 | B: 0.69100 | C: 0.68737 [LOGITS Ex2 A] Mean Abs: 0.528 | Max: 2.839 [LOSS Ex2] A: 0.53229 | B: 0.54058 | C: 0.55472 ** [JOINT LOSS] ** : 1.001987 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001646 | Grad Max: 0.028318 -> Layer: shared_layers.0.bias | Grad Mean: 0.027881 | Grad Max: 0.143415 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001137 | Grad Max: 0.004745 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018202 | Grad Max: 0.018202 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000235 | Grad Max: 
0.009094 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004617 | Grad Max: 0.040848 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.001640 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002155 | Grad Max: 0.008160 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000404 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000776 | Grad Max: 0.002319 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000480 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000421 | Grad Max: 0.001525 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001631 | Grad Max: 0.004545 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018164 | Grad Max: 0.018164 [GRADIENT NORM TOTAL] 0.5671 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.517357 0.48264304] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 67/1981 | C: 213/1835 [LOSS Ex1] A: 0.68791 | B: 0.69106 | C: 0.68671 [LOGITS Ex2 A] Mean Abs: 0.524 | Max: 2.723 [LOSS Ex2] A: 0.51980 | B: 0.52816 | C: 0.52973 ** [JOINT LOSS] ** : 1.214457 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.025935 -> Layer: shared_layers.0.bias | Grad Mean: 0.007804 | Grad Max: 0.080741 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001967 | Grad Max: 0.010728 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020026 | Grad Max: 0.020026 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000104 | Grad Max: 0.007253 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001319 | Grad Max: 0.029377 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000020 | Grad Max: 0.001106 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000337 | Grad Max: 0.003620 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000100 | Grad Max: 0.000604 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000183 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000054 | Grad Max: 0.000456 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001934 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000004 | Grad Max: 0.000004 [GRADIENT NORM TOTAL] 0.2304 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.162 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51819885 0.48180115] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 53/1995 | C: 182/1866 [LOSS Ex1] A: 0.68786 | B: 0.68971 | C: 0.68685 [LOGITS Ex2 A] Mean Abs: 0.525 | Max: 2.714 [LOSS Ex2] A: 0.53753 | B: 0.52880 | C: 0.52948 ** [JOINT LOSS] ** : 1.220075 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001734 | Grad Max: 0.036972 -> Layer: shared_layers.0.bias | Grad Mean: 0.024904 | Grad Max: 0.124580 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001629 | Grad Max: 0.009148 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011759 | Grad Max: 0.011759 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000266 | Grad Max: 0.007358 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004314 | Grad Max: 0.038450 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.001992 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002015 | Grad Max: 0.008547 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000519 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000731 | Grad Max: 0.002222 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000421 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000398 | Grad Max: 0.001500 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001793 | Grad Max: 0.005279 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018146 | Grad Max: 0.018146 [GRADIENT NORM TOTAL] 0.5517 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.136 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069747 0.49302527] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 66/1790 | C: 224/1824 [LOSS Ex1] A: 0.00000 | B: 0.69111 | C: 0.68522 [LOGITS Ex2 A] Mean Abs: 0.510 | Max: 2.989 [LOSS Ex2] A: 0.53117 | B: 0.50943 | C: 0.50628 ** [JOINT LOSS] ** : 0.974403 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001237 | Grad Max: 0.022786 -> Layer: shared_layers.0.bias | Grad Mean: 0.014035 | Grad Max: 0.074890 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001093 | Grad Max: 0.005215 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013083 | Grad Max: 0.013083 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000162 | Grad Max: 0.006693 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002448 | Grad Max: 0.034534 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000038 | Grad Max: 0.001321 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001042 | Grad Max: 0.005002 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000316 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000378 | Grad Max: 0.001569 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000204 | Grad Max: 0.000830 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000974 | Grad Max: 0.003897 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009372 | Grad Max: 0.009372 [GRADIENT NORM TOTAL] 0.3323 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.155 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50258607 0.49741387] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 65/1983 | C: 211/1837 [LOSS Ex1] A: 0.68900 | B: 0.69084 | C: 0.68526 [LOGITS Ex2 A] Mean Abs: 0.538 | Max: 2.540 [LOSS 
Ex2] A: 0.52636 | B: 0.54178 | C: 0.53868 ** [JOINT LOSS] ** : 1.223968 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001566 | Grad Max: 0.032212 -> Layer: shared_layers.0.bias | Grad Mean: 0.038920 | Grad Max: 0.196193 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.010794 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020778 | Grad Max: 0.020778 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000323 | Grad Max: 0.009536 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006227 | Grad Max: 0.046921 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.002522 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002908 | Grad Max: 0.010706 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000503 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001017 | Grad Max: 0.002874 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000490 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000554 | Grad Max: 0.001923 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002143 | Grad Max: 0.005652 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023606 | Grad Max: 0.023606 [GRADIENT NORM TOTAL] 0.7773 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51421845 0.48578158] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 65/1983 | C: 116/1260 [LOSS Ex1] A: 0.68893 | B: 0.69090 | C: 0.68609 [LOGITS Ex2 A] Mean Abs: 0.533 | Max: 2.935 [LOSS Ex2] A: 0.53475 | B: 0.52902 | C: 0.53377 ** [JOINT LOSS] ** : 1.221152 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001288 | Grad Max: 0.023680 -> Layer: shared_layers.0.bias | Grad Mean: 0.019156 | Grad Max: 0.079529 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001734 | Grad Max: 0.010109 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015869 | Grad Max: 
0.015869 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000165 | Grad Max: 0.007734 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002964 | Grad Max: 0.044326 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.001484 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001200 | Grad Max: 0.005424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000349 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000406 | Grad Max: 0.001926 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000222 | Grad Max: 0.000964 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000797 | Grad Max: 0.002606 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008944 | Grad Max: 0.008944 [GRADIENT NORM TOTAL] 0.3853 [EPOCH SUMMARY] Train Loss: 1.1428 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1291 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.1823 -> New: 1.1291) ############################## EPOCH 6/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.088 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50534266 0.49465734] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 50/1998 | C: 218/1830 [LOSS Ex1] A: 0.00000 | B: 0.68955 | C: 0.68605 [LOGITS Ex2 A] Mean Abs: 0.532 | Max: 2.744 [LOSS Ex2] A: 0.51887 | B: 0.52301 | C: 0.52195 ** [JOINT LOSS] ** : 0.979809 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001504 | Grad Max: 0.024302 -> Layer: shared_layers.0.bias | Grad Mean: 0.024432 | Grad Max: 0.110647 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001114 | Grad Max: 0.005264 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014180 | Grad Max: 0.014180 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000258 | Grad Max: 0.007922 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.004176 | Grad Max: 0.041863 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.001754 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001914 | Grad Max: 0.008169 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000468 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000688 | Grad Max: 0.002397 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000432 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000374 | Grad Max: 0.001487 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001634 | Grad Max: 0.004935 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016991 | Grad Max: 0.016991 [GRADIENT NORM TOTAL] 0.5179 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50204796 0.49795207] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 62/1794 | C: 176/1872 [LOSS Ex1] A: 0.00000 | B: 0.69095 | C: 0.68787 [LOGITS Ex2 A] Mean Abs: 0.545 | Max: 2.733 [LOSS Ex2] A: 0.52642 | B: 0.51226 | C: 0.52560 ** [JOINT LOSS] ** : 0.981033 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001465 | Grad Max: 0.024768 -> Layer: shared_layers.0.bias | Grad Mean: 0.031608 | Grad Max: 0.147032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001136 | Grad Max: 0.004784 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017543 | Grad Max: 0.017543 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000294 | Grad Max: 0.007647 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005052 | Grad Max: 0.038045 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.002148 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002369 | Grad Max: 0.010120 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000484 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000842 | Grad Max: 0.002649 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000511 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000454 | Grad Max: 0.002058 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001942 | Grad Max: 0.006403 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020587 | Grad Max: 0.020587 [GRADIENT NORM TOTAL] 0.6385 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.171 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51697916 0.4830209 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 62/1986 | C: 194/1854 [LOSS Ex1] A: 0.68831 | B: 0.69067 | C: 0.68681 [LOGITS Ex2 A] Mean Abs: 0.553 | Max: 2.669 [LOSS Ex2] A: 0.51743 | B: 0.53257 | C: 0.53365 ** [JOINT LOSS] ** : 1.216480 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001815 | Grad Max: 0.043466 -> Layer: shared_layers.0.bias | Grad Mean: 0.040945 | Grad Max: 0.217127 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.011166 -> Layer: exit1_layers.0.bias | Grad Mean: 0.025102 | Grad Max: 0.025102 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000365 | Grad Max: 0.010041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007057 | Grad Max: 0.054933 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.002772 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003326 | Grad Max: 0.012134 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000566 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001159 | Grad Max: 0.003187 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000515 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000624 | Grad Max: 0.001930 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002515 | Grad Max: 0.006985 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026766 | Grad Max: 0.026766 [GRADIENT NORM TOTAL] 0.8544 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | 
Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.170 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5111431 0.4888569] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 63/1985 | C: 230/1818 [LOSS Ex1] A: 0.00000 | B: 0.69074 | C: 0.68387 [LOGITS Ex2 A] Mean Abs: 0.542 | Max: 2.729 [LOSS Ex2] A: 0.51692 | B: 0.51935 | C: 0.52354 ** [JOINT LOSS] ** : 0.978143 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001310 | Grad Max: 0.024196 -> Layer: shared_layers.0.bias | Grad Mean: 0.009203 | Grad Max: 0.074947 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001065 | Grad Max: 0.005386 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009082 | Grad Max: 0.009082 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000109 | Grad Max: 0.008875 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001661 | Grad Max: 0.048021 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000026 | Grad Max: 0.001368 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000593 | Grad Max: 0.004878 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000318 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000187 | Grad Max: 0.001104 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000644 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000481 | Grad Max: 0.002187 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003759 | Grad Max: 0.003759 [GRADIENT NORM TOTAL] 0.2543 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5165047 0.4834953] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 2/2046 | B: 50/1998 | C: 184/1864 [LOSS Ex1] A: 0.68752 | B: 0.68938 | C: 0.68212 [LOGITS Ex2 A] Mean Abs: 0.531 | Max: 2.774 [LOSS Ex2] A: 
0.52017 | B: 0.51610 | C: 0.50755 ** [JOINT LOSS] ** : 1.200948 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001576 | Grad Max: 0.023489 -> Layer: shared_layers.0.bias | Grad Mean: 0.031091 | Grad Max: 0.143981 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.010495 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008827 | Grad Max: 0.008827 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000293 | Grad Max: 0.011097 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004732 | Grad Max: 0.046883 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.002117 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002177 | Grad Max: 0.009072 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000410 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000752 | Grad Max: 0.002236 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000393 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001822 | Grad Max: 0.005220 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018876 | Grad Max: 0.018876 [GRADIENT NORM TOTAL] 0.6187 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.161 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51741415 0.48258588] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 63/1793 | C: 216/1832 [LOSS Ex1] A: 0.68758 | B: 0.69078 | C: 0.68455 [LOGITS Ex2 A] Mean Abs: 0.530 | Max: 2.836 [LOSS Ex2] A: 0.52892 | B: 0.50219 | C: 0.51742 ** [JOINT LOSS] ** : 1.203812 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001387 | Grad Max: 0.028221 -> Layer: shared_layers.0.bias | Grad Mean: 0.015110 | Grad Max: 0.074633 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001676 | Grad Max: 0.009343 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011110 | Grad Max: 0.011110 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000172 | Grad Max: 0.007887 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002604 | Grad Max: 0.033906 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.001546 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001122 | Grad Max: 0.006333 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000284 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000394 | Grad Max: 0.001445 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000333 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000212 | Grad Max: 0.001014 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001066 | Grad Max: 0.003673 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010485 | Grad Max: 0.010485 [GRADIENT NORM TOTAL] 0.3593 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.136 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506132 0.493868] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 62/1986 | C: 184/1864 [LOSS Ex1] A: 0.00000 | B: 0.69050 | C: 0.68904 [LOGITS Ex2 A] Mean Abs: 0.533 | Max: 2.692 [LOSS Ex2] A: 0.51851 | B: 0.53078 | C: 0.50655 ** [JOINT LOSS] ** : 0.978462 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.042217 -> Layer: shared_layers.0.bias | Grad Mean: 0.052706 | Grad Max: 0.276055 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001530 | Grad Max: 0.006034 -> Layer: exit1_layers.0.bias | Grad Mean: 0.028762 | Grad Max: 0.028762 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000452 | Grad Max: 0.012068 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008541 | Grad Max: 0.066558 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.002860 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004000 | Grad Max: 0.013394 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000700 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.001363 | Grad Max: 0.003615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000577 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000725 | Grad Max: 0.002489 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002840 | Grad Max: 0.006864 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030730 | Grad Max: 0.030730 [GRADIENT NORM TOTAL] 1.0436 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.157 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50455683 0.49544317] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 63/1985 | C: 210/1838 [LOSS Ex1] A: 0.68857 | B: 0.69058 | C: 0.68616 [LOGITS Ex2 A] Mean Abs: 0.542 | Max: 2.933 [LOSS Ex2] A: 0.51831 | B: 0.51584 | C: 0.50813 ** [JOINT LOSS] ** : 1.202531 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.021107 -> Layer: shared_layers.0.bias | Grad Mean: 0.009940 | Grad Max: 0.048633 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002041 | Grad Max: 0.011028 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024833 | Grad Max: 0.024833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000133 | Grad Max: 0.006888 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001765 | Grad Max: 0.038353 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000027 | Grad Max: 0.001258 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000617 | Grad Max: 0.004247 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000322 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000204 | Grad Max: 0.001011 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000210 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000690 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000703 | Grad Max: 0.003266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005321 | Grad Max: 0.005321 [GRADIENT NORM TOTAL] 
0.2583 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5135595 0.48644048] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 50/1998 | C: 162/1886 [LOSS Ex1] A: 0.68860 | B: 0.68921 | C: 0.68684 [LOGITS Ex2 A] Mean Abs: 0.547 | Max: 3.003 [LOSS Ex2] A: 0.53199 | B: 0.50768 | C: 0.52897 ** [JOINT LOSS] ** : 1.211098 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.049757 -> Layer: shared_layers.0.bias | Grad Mean: 0.033185 | Grad Max: 0.158825 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001761 | Grad Max: 0.009777 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019749 | Grad Max: 0.019749 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000352 | Grad Max: 0.009325 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005900 | Grad Max: 0.048400 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.002409 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002782 | Grad Max: 0.011746 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000954 | Grad Max: 0.002818 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000428 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001712 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002235 | Grad Max: 0.005598 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022687 | Grad Max: 0.022687 [GRADIENT NORM TOTAL] 0.7205 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.087 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50723404 0.49276593] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 66/1790 | C: 177/1871 [LOSS Ex1] A: 0.00000 | B: 
0.69062 | C: 0.68639 [LOGITS Ex2 A] Mean Abs: 0.552 | Max: 2.933 [LOSS Ex2] A: 0.50671 | B: 0.49487 | C: 0.51747 ** [JOINT LOSS] ** : 0.965351 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001222 | Grad Max: 0.020312 -> Layer: shared_layers.0.bias | Grad Mean: 0.007300 | Grad Max: 0.060675 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001045 | Grad Max: 0.004511 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015426 | Grad Max: 0.015426 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000114 | Grad Max: 0.006272 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001371 | Grad Max: 0.035143 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.001073 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000434 | Grad Max: 0.004028 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000136 | Grad Max: 0.001148 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000069 | Grad Max: 0.000440 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000686 | Grad Max: 0.002489 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002752 | Grad Max: 0.002752 [GRADIENT NORM TOTAL] 0.2242 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008671 0.4991329] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 64/1984 | C: 205/1843 [LOSS Ex1] A: 0.00000 | B: 0.69033 | C: 0.68556 [LOGITS Ex2 A] Mean Abs: 0.579 | Max: 2.948 [LOSS Ex2] A: 0.50062 | B: 0.52490 | C: 0.52674 ** [JOINT LOSS] ** : 0.976053 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001574 | Grad Max: 0.032614 -> Layer: shared_layers.0.bias | Grad Mean: 0.025411 | Grad Max: 0.120497 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001072 | Grad Max: 0.004960 -> 
Layer: exit1_layers.0.bias | Grad Mean: 0.013263 | Grad Max: 0.013263 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000239 | Grad Max: 0.011582 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004555 | Grad Max: 0.062425 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.001959 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002009 | Grad Max: 0.007686 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000410 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000679 | Grad Max: 0.002152 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000353 | Grad Max: 0.001473 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001350 | Grad Max: 0.004108 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014377 | Grad Max: 0.014377 [GRADIENT NORM TOTAL] 0.5421 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.174 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51625067 0.48374933] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 63/1985 | C: 192/1856 [LOSS Ex1] A: 0.68794 | B: 0.69042 | C: 0.68523 [LOGITS Ex2 A] Mean Abs: 0.574 | Max: 2.860 [LOSS Ex2] A: 0.51524 | B: 0.50739 | C: 0.51994 ** [JOINT LOSS] ** : 1.202051 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001659 | Grad Max: 0.032891 -> Layer: shared_layers.0.bias | Grad Mean: 0.033307 | Grad Max: 0.175288 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001833 | Grad Max: 0.010178 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016216 | Grad Max: 0.016216 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000322 | Grad Max: 0.010280 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005918 | Grad Max: 0.048324 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.002545 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002706 | Grad Max: 0.010407 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000515 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000903 | Grad Max: 0.002652 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000466 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000476 | Grad Max: 0.001786 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001811 | Grad Max: 0.004959 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019596 | Grad Max: 0.019596 [GRADIENT NORM TOTAL] 0.7036 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105901 0.4894099] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 50/1998 | C: 209/1839 [LOSS Ex1] A: 0.00000 | B: 0.68904 | C: 0.68522 [LOGITS Ex2 A] Mean Abs: 0.561 | Max: 2.874 [LOSS Ex2] A: 0.51521 | B: 0.50047 | C: 0.50763 ** [JOINT LOSS] ** : 0.965858 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001507 | Grad Max: 0.024436 -> Layer: shared_layers.0.bias | Grad Mean: 0.036664 | Grad Max: 0.171374 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001037 | Grad Max: 0.005243 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011228 | Grad Max: 0.011228 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000326 | Grad Max: 0.010578 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005621 | Grad Max: 0.055375 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.002047 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002660 | Grad Max: 0.009036 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000579 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000897 | Grad Max: 0.002561 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000415 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000468 | Grad Max: 0.001695 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001989 | Grad Max: 0.005744 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.020598 | Grad Max: 0.020598 [GRADIENT NORM TOTAL] 0.7172 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5156869 0.48431307] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 66/1790 | C: 125/1251 [LOSS Ex1] A: 0.68713 | B: 0.69045 | C: 0.68487 [LOGITS Ex2 A] Mean Abs: 0.563 | Max: 3.009 [LOSS Ex2] A: 0.50511 | B: 0.49842 | C: 0.52784 ** [JOINT LOSS] ** : 1.197941 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.027064 -> Layer: shared_layers.0.bias | Grad Mean: 0.044123 | Grad Max: 0.216921 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.011184 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020557 | Grad Max: 0.020557 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000376 | Grad Max: 0.010977 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006635 | Grad Max: 0.052783 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000101 | Grad Max: 0.002527 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003108 | Grad Max: 0.012447 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000571 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001037 | Grad Max: 0.003253 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000530 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000546 | Grad Max: 0.001870 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002140 | Grad Max: 0.005906 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023866 | Grad Max: 0.023866 [GRADIENT NORM TOTAL] 0.8545 [EPOCH SUMMARY] Train Loss: 1.0900 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.1453 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 7/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.163 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5166547 0.48334527] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 66/1982 | C: 193/1855 [LOSS Ex1] A: 0.68731 | B: 0.69017 | C: 0.68463 [LOGITS Ex2 A] Mean Abs: 0.572 | Max: 2.975 [LOSS Ex2] A: 0.50837 | B: 0.51307 | C: 0.51282 ** [JOINT LOSS] ** : 1.198791 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001222 | Grad Max: 0.026620 -> Layer: shared_layers.0.bias | Grad Mean: 0.024982 | Grad Max: 0.129007 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001640 | Grad Max: 0.009119 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010496 | Grad Max: 0.010496 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000216 | Grad Max: 0.013134 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004005 | Grad Max: 0.071642 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.001722 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001736 | Grad Max: 0.007027 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000400 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000570 | Grad Max: 0.001931 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000318 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000296 | Grad Max: 0.001316 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001069 | Grad Max: 0.002983 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011836 | Grad Max: 0.011836 [GRADIENT NORM TOTAL] 0.5000 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.135 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052826 0.49471748] | Indices: [1 
0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 63/1985 | C: 190/1858 [LOSS Ex1] A: 0.00000 | B: 0.69026 | C: 0.68585 [LOGITS Ex2 A] Mean Abs: 0.571 | Max: 2.957 [LOSS Ex2] A: 0.51065 | B: 0.50141 | C: 0.51224 ** [JOINT LOSS] ** : 0.966803 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001680 | Grad Max: 0.043896 -> Layer: shared_layers.0.bias | Grad Mean: 0.047891 | Grad Max: 0.267380 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001059 | Grad Max: 0.005079 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012714 | Grad Max: 0.012714 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000428 | Grad Max: 0.013223 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.069017 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.002682 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003720 | Grad Max: 0.012898 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000588 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001213 | Grad Max: 0.003378 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000586 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000625 | Grad Max: 0.002188 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002389 | Grad Max: 0.005867 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025739 | Grad Max: 0.025739 [GRADIENT NORM TOTAL] 0.9687 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.160 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50645214 0.49354786] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 51/1997 | C: 196/1852 [LOSS Ex1] A: 0.68816 | B: 0.68887 | C: 0.68516 [LOGITS Ex2 A] Mean Abs: 0.586 | Max: 3.179 [LOSS Ex2] A: 0.50405 | B: 0.49673 | C: 0.50634 ** [JOINT LOSS] ** : 1.189768 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001239 | Grad Max: 0.022194 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.011635 | Grad Max: 0.085967 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.011082 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021818 | Grad Max: 0.021818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000137 | Grad Max: 0.007208 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001864 | Grad Max: 0.035852 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000027 | Grad Max: 0.001233 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000680 | Grad Max: 0.004415 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000255 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000231 | Grad Max: 0.001186 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000124 | Grad Max: 0.000666 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000678 | Grad Max: 0.003018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006113 | Grad Max: 0.006113 [GRADIENT NORM TOTAL] 0.2799 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.177 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5129169 0.48708302] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 66/1790 | C: 179/1869 [LOSS Ex1] A: 0.68828 | B: 0.69028 | C: 0.68543 [LOGITS Ex2 A] Mean Abs: 0.587 | Max: 3.082 [LOSS Ex2] A: 0.50889 | B: 0.49214 | C: 0.53162 ** [JOINT LOSS] ** : 1.198883 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002462 | Grad Max: 0.070061 -> Layer: shared_layers.0.bias | Grad Mean: 0.066738 | Grad Max: 0.349259 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001726 | Grad Max: 0.009808 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017326 | Grad Max: 0.017326 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000639 | Grad Max: 0.019292 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011404 | Grad Max: 0.099436 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.003691 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005437 | Grad Max: 0.018404 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000864 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001784 | Grad Max: 0.004580 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000654 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000921 | Grad Max: 0.002751 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003748 | Grad Max: 0.009266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039598 | Grad Max: 0.039598 [GRADIENT NORM TOTAL] 1.3750 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090695 0.4909305] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 66/1982 | C: 193/1855 [LOSS Ex1] A: 0.00000 | B: 0.69000 | C: 0.68719 [LOGITS Ex2 A] Mean Abs: 0.588 | Max: 3.094 [LOSS Ex2] A: 0.50313 | B: 0.50869 | C: 0.49909 ** [JOINT LOSS] ** : 0.962699 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001373 | Grad Max: 0.022430 -> Layer: shared_layers.0.bias | Grad Mean: 0.009645 | Grad Max: 0.068038 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.005781 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024995 | Grad Max: 0.024995 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000137 | Grad Max: 0.009804 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001892 | Grad Max: 0.050665 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000028 | Grad Max: 0.001465 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000661 | Grad Max: 0.005323 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000220 | Grad Max: 0.001335 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000208 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000727 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.003046 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004892 | Grad Max: 0.004892 [GRADIENT NORM TOTAL] 0.2832 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002838 0.49971628] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 62/1986 | C: 185/1863 [LOSS Ex1] A: 0.00000 | B: 0.69010 | C: 0.68523 [LOGITS Ex2 A] Mean Abs: 0.633 | Max: 3.183 [LOSS Ex2] A: 0.49007 | B: 0.50930 | C: 0.50278 ** [JOINT LOSS] ** : 0.959159 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001581 | Grad Max: 0.026183 -> Layer: shared_layers.0.bias | Grad Mean: 0.045153 | Grad Max: 0.227735 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001102 | Grad Max: 0.005113 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012787 | Grad Max: 0.012787 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000390 | Grad Max: 0.016125 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007410 | Grad Max: 0.089090 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.003029 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003400 | Grad Max: 0.012536 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000589 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001078 | Grad Max: 0.003361 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000417 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000545 | Grad Max: 0.001677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001929 | Grad Max: 0.004586 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021621 | Grad Max: 0.021621 [GRADIENT NORM TOTAL] 0.8842 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 
0.022 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51558936 0.48441064] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 51/1997 | C: 189/1859 [LOSS Ex1] A: 0.68757 | B: 0.68870 | C: 0.68444 [LOGITS Ex2 A] Mean Abs: 0.612 | Max: 3.102 [LOSS Ex2] A: 0.50618 | B: 0.49837 | C: 0.50648 ** [JOINT LOSS] ** : 1.190575 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001502 | Grad Max: 0.031811 -> Layer: shared_layers.0.bias | Grad Mean: 0.038535 | Grad Max: 0.217347 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.010432 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016209 | Grad Max: 0.016209 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000324 | Grad Max: 0.017731 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006201 | Grad Max: 0.094083 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.002458 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002785 | Grad Max: 0.010429 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000538 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000885 | Grad Max: 0.002824 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000350 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000450 | Grad Max: 0.001491 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001601 | Grad Max: 0.004025 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017845 | Grad Max: 0.017845 [GRADIENT NORM TOTAL] 0.7619 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.177 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100559 0.48994413] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 67/1789 | C: 179/1869 [LOSS Ex1] A: 0.68775 | B: 0.69012 | C: 0.68622 [LOGITS Ex2 A] Mean Abs: 0.604 | Max: 3.136 [LOSS Ex2] A: 0.49914 | B: 0.49513 | C: 0.48691 ** [JOINT LOSS] ** : 1.181755 [GRADIENTS 
CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.044163 -> Layer: shared_layers.0.bias | Grad Mean: 0.057964 | Grad Max: 0.304553 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.011262 -> Layer: exit1_layers.0.bias | Grad Mean: 0.023109 | Grad Max: 0.023109 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000525 | Grad Max: 0.018794 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009486 | Grad Max: 0.098453 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.003358 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004557 | Grad Max: 0.015738 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000627 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001470 | Grad Max: 0.003815 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000586 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000747 | Grad Max: 0.002428 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002950 | Grad Max: 0.007537 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032059 | Grad Max: 0.032059 [GRADIENT NORM TOTAL] 1.1652 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.181 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51494133 0.48505864] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 66/1982 | C: 175/1873 [LOSS Ex1] A: 0.68674 | B: 0.68983 | C: 0.68639 [LOGITS Ex2 A] Mean Abs: 0.595 | Max: 3.159 [LOSS Ex2] A: 0.50041 | B: 0.50842 | C: 0.49749 ** [JOINT LOSS] ** : 1.189760 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001371 | Grad Max: 0.036705 -> Layer: shared_layers.0.bias | Grad Mean: 0.053471 | Grad Max: 0.276733 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.011418 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021426 | Grad Max: 0.021426 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000443 | Grad Max: 0.014052 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.007993 | Grad Max: 0.071995 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.002685 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003821 | Grad Max: 0.013537 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000581 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001213 | Grad Max: 0.003302 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000517 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000616 | Grad Max: 0.002098 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002436 | Grad Max: 0.006151 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026750 | Grad Max: 0.026750 [GRADIENT NORM TOTAL] 1.0363 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5159615 0.4840385] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 62/1986 | C: 182/1866 [LOSS Ex1] A: 0.68703 | B: 0.68994 | C: 0.68545 [LOGITS Ex2 A] Mean Abs: 0.613 | Max: 2.895 [LOSS Ex2] A: 0.49483 | B: 0.50625 | C: 0.50717 ** [JOINT LOSS] ** : 1.190224 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.042219 -> Layer: shared_layers.0.bias | Grad Mean: 0.071472 | Grad Max: 0.401655 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001696 | Grad Max: 0.009181 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014394 | Grad Max: 0.014394 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000624 | Grad Max: 0.018347 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011893 | Grad Max: 0.102828 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.004153 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005606 | Grad Max: 0.018722 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000781 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001763 | Grad Max: 0.004806 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000711 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000885 | Grad Max: 0.002850 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003294 | Grad Max: 0.007399 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035928 | Grad Max: 0.035928 [GRADIENT NORM TOTAL] 1.4308 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.135 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50444365 0.4955564 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 52/1996 | C: 160/1888 [LOSS Ex1] A: 0.00000 | B: 0.68852 | C: 0.68578 [LOGITS Ex2 A] Mean Abs: 0.588 | Max: 3.021 [LOSS Ex2] A: 0.50447 | B: 0.49194 | C: 0.49586 ** [JOINT LOSS] ** : 0.955524 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.044187 -> Layer: shared_layers.0.bias | Grad Mean: 0.054867 | Grad Max: 0.285694 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001067 | Grad Max: 0.005091 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011222 | Grad Max: 0.011222 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000507 | Grad Max: 0.016042 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009455 | Grad Max: 0.084474 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.003195 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004291 | Grad Max: 0.015217 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000643 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001347 | Grad Max: 0.003747 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000504 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000678 | Grad Max: 0.002153 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002516 | Grad Max: 0.005919 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027413 | Grad Max: 0.027413 [GRADIENT NORM TOTAL] 1.1047 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.163 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50835836 0.49164167] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 68/1788 | C: 208/1840 [LOSS Ex1] A: 0.68773 | B: 0.68995 | C: 0.68253 [LOGITS Ex2 A] Mean Abs: 0.606 | Max: 3.220 [LOSS Ex2] A: 0.50201 | B: 0.48693 | C: 0.47924 ** [JOINT LOSS] ** : 1.176134 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001582 | Grad Max: 0.036885 -> Layer: shared_layers.0.bias | Grad Mean: 0.051659 | Grad Max: 0.280329 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001893 | Grad Max: 0.010370 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015015 | Grad Max: 0.015015 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.015685 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008471 | Grad Max: 0.087075 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.003649 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004004 | Grad Max: 0.018398 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000545 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001262 | Grad Max: 0.003371 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000471 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000634 | Grad Max: 0.001970 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002516 | Grad Max: 0.006560 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027149 | Grad Max: 0.027149 [GRADIENT NORM TOTAL] 1.0450 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.022 | Max: 0.181 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5123 0.4877] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 67/1981 | C: 199/1849 [LOSS Ex1] A: 0.68794 | B: 0.68966 | C: 0.68361 [LOGITS Ex2 A] Mean Abs: 0.605 | Max: 3.082 [LOSS Ex2] A: 
0.49829 | B: 0.51394 | C: 0.48242 ** [JOINT LOSS] ** : 1.185288 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.053429 -> Layer: shared_layers.0.bias | Grad Mean: 0.042927 | Grad Max: 0.220794 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.009463 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014881 | Grad Max: 0.014881 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000443 | Grad Max: 0.014232 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007623 | Grad Max: 0.070951 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.003213 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003520 | Grad Max: 0.014701 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000526 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001120 | Grad Max: 0.002974 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000510 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000561 | Grad Max: 0.001933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002203 | Grad Max: 0.005778 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023724 | Grad Max: 0.023724 [GRADIENT NORM TOTAL] 0.8994 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109639 0.48903605] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 62/1986 | C: 108/1268 [LOSS Ex1] A: 0.00000 | B: 0.68978 | C: 0.68594 [LOGITS Ex2 A] Mean Abs: 0.634 | Max: 3.071 [LOSS Ex2] A: 0.49336 | B: 0.49512 | C: 0.51424 ** [JOINT LOSS] ** : 0.959479 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001649 | Grad Max: 0.034500 -> Layer: shared_layers.0.bias | Grad Mean: 0.052616 | Grad Max: 0.294213 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001092 | Grad Max: 0.004875 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016009 | Grad Max: 0.016009 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000450 | Grad Max: 0.017203 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008520 | Grad Max: 0.095446 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000129 | Grad Max: 0.003364 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003944 | Grad Max: 0.013906 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000586 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001218 | Grad Max: 0.003567 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000444 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000608 | Grad Max: 0.001940 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002339 | Grad Max: 0.005538 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024794 | Grad Max: 0.024794 [GRADIENT NORM TOTAL] 1.0374 [EPOCH SUMMARY] Train Loss: 1.1075 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1628 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 8/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014596 0.4985404] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 53/1995 | C: 202/1846 [LOSS Ex1] A: 0.00000 | B: 0.68835 | C: 0.68522 [LOGITS Ex2 A] Mean Abs: 0.662 | Max: 3.136 [LOSS Ex2] A: 0.48063 | B: 0.49216 | C: 0.49440 ** [JOINT LOSS] ** : 0.946921 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001281 | Grad Max: 0.028011 -> Layer: shared_layers.0.bias | Grad Mean: 0.031994 | Grad Max: 0.170667 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001090 | Grad Max: 0.005328 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011797 | Grad Max: 0.011797 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000270 | Grad Max: 0.018238 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005030 | Grad 
Max: 0.099358 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.002314 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002199 | Grad Max: 0.008153 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000422 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000674 | Grad Max: 0.002407 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000295 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000333 | Grad Max: 0.001219 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001077 | Grad Max: 0.003396 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012529 | Grad Max: 0.012529 [GRADIENT NORM TOTAL] 0.6271 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.182 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51497704 0.48502293] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 69/1787 | C: 192/1856 [LOSS Ex1] A: 0.68718 | B: 0.68979 | C: 0.68401 [LOGITS Ex2 A] Mean Abs: 0.622 | Max: 3.213 [LOSS Ex2] A: 0.48330 | B: 0.48295 | C: 0.48954 ** [JOINT LOSS] ** : 1.172257 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001308 | Grad Max: 0.022284 -> Layer: shared_layers.0.bias | Grad Mean: 0.022813 | Grad Max: 0.105789 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.010660 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016261 | Grad Max: 0.016261 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000234 | Grad Max: 0.010593 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004003 | Grad Max: 0.049222 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.001733 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001776 | Grad Max: 0.007097 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000374 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000551 | Grad Max: 0.001747 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000278 -> 
Layer: exit2_layers.9.bias | Grad Mean: 0.000272 | Grad Max: 0.001141 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.003844 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011973 | Grad Max: 0.011973 [GRADIENT NORM TOTAL] 0.4919 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.181 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095605 0.49043947] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 68/1980 | C: 170/1878 [LOSS Ex1] A: 0.68733 | B: 0.68949 | C: 0.68494 [LOGITS Ex2 A] Mean Abs: 0.623 | Max: 3.363 [LOSS Ex2] A: 0.49127 | B: 0.50046 | C: 0.52903 ** [JOINT LOSS] ** : 1.194175 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001537 | Grad Max: 0.040065 -> Layer: shared_layers.0.bias | Grad Mean: 0.045669 | Grad Max: 0.247488 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.011131 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019835 | Grad Max: 0.019835 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000417 | Grad Max: 0.016754 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007281 | Grad Max: 0.083930 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.002887 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003318 | Grad Max: 0.011793 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000496 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001020 | Grad Max: 0.002941 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000434 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001787 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002014 | Grad Max: 0.004731 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022336 | Grad Max: 0.022336 [GRADIENT NORM TOTAL] 0.9169 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.023 | Max: 0.185 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5142704 0.48572958] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 62/1986 | C: 203/1845 [LOSS Ex1] A: 0.68632 | B: 0.68961 | C: 0.68397 [LOGITS Ex2 A] Mean Abs: 0.640 | Max: 3.064 [LOSS Ex2] A: 0.49366 | B: 0.48732 | C: 0.48080 ** [JOINT LOSS] ** : 1.173897 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001788 | Grad Max: 0.051340 -> Layer: shared_layers.0.bias | Grad Mean: 0.035261 | Grad Max: 0.196354 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001901 | Grad Max: 0.010620 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015471 | Grad Max: 0.015471 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000370 | Grad Max: 0.013684 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006649 | Grad Max: 0.072739 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000101 | Grad Max: 0.002562 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002996 | Grad Max: 0.011374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000504 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000922 | Grad Max: 0.002904 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000364 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000454 | Grad Max: 0.001368 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001611 | Grad Max: 0.004018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017602 | Grad Max: 0.017602 [GRADIENT NORM TOTAL] 0.7762 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.169 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51535636 0.4846436 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 53/1995 | C: 167/1881 [LOSS Ex1] A: 0.68673 | B: 0.68818 | C: 0.68448 [LOGITS Ex2 A] Mean Abs: 0.641 | Max: 3.258 [LOSS Ex2] A: 0.48576 | B: 0.48379 | C: 0.49133 ** [JOINT LOSS] ** : 1.173422 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001243 | Grad Max: 0.026636 -> Layer: shared_layers.0.bias | Grad Mean: 0.027474 | Grad Max: 0.140770 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.009038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007408 | Grad Max: 0.007408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000247 | Grad Max: 0.013958 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004284 | Grad Max: 0.074683 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002091 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001570 | Grad Max: 0.008109 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000460 | Grad Max: 0.001888 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000248 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000220 | Grad Max: 0.000966 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000648 | Grad Max: 0.002650 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007781 | Grad Max: 0.007781 [GRADIENT NORM TOTAL] 0.5283 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.136 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036633 0.49633673] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 69/1787 | C: 184/1864 [LOSS Ex1] A: 0.00000 | B: 0.68962 | C: 0.68486 [LOGITS Ex2 A] Mean Abs: 0.604 | Max: 3.159 [LOSS Ex2] A: 0.48655 | B: 0.47878 | C: 0.48689 ** [JOINT LOSS] ** : 0.942236 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001444 | Grad Max: 0.034419 -> Layer: shared_layers.0.bias | Grad Mean: 0.032541 | Grad Max: 0.179268 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001141 | Grad Max: 0.005096 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015473 | Grad Max: 0.015473 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000324 | Grad Max: 
0.011393 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005725 | Grad Max: 0.062115 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.002297 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002519 | Grad Max: 0.009383 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000390 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000787 | Grad Max: 0.002366 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000318 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000388 | Grad Max: 0.001387 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001561 | Grad Max: 0.004785 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016957 | Grad Max: 0.016957 [GRADIENT NORM TOTAL] 0.6806 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.167 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.510231 0.48976895] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 69/1979 | C: 190/1858 [LOSS Ex1] A: 0.68728 | B: 0.68932 | C: 0.68446 [LOGITS Ex2 A] Mean Abs: 0.648 | Max: 3.216 [LOSS Ex2] A: 0.48450 | B: 0.49997 | C: 0.49178 ** [JOINT LOSS] ** : 1.179105 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001100 | Grad Max: 0.017085 -> Layer: shared_layers.0.bias | Grad Mean: 0.007199 | Grad Max: 0.040773 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002035 | Grad Max: 0.010743 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022081 | Grad Max: 0.022081 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000121 | Grad Max: 0.016456 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001697 | Grad Max: 0.090866 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.001317 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000447 | Grad Max: 0.004513 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000113 | Grad Max: 0.000889 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000055 | Grad Max: 0.000490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001906 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001914 | Grad Max: 0.001914 [GRADIENT NORM TOTAL] 0.2546 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.186 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117834 0.48821658] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 62/1986 | C: 155/1893 [LOSS Ex1] A: 0.68758 | B: 0.68945 | C: 0.68676 [LOGITS Ex2 A] Mean Abs: 0.651 | Max: 3.346 [LOSS Ex2] A: 0.48327 | B: 0.48818 | C: 0.47961 ** [JOINT LOSS] ** : 1.171619 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001312 | Grad Max: 0.024286 -> Layer: shared_layers.0.bias | Grad Mean: 0.024407 | Grad Max: 0.117108 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001782 | Grad Max: 0.009847 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019358 | Grad Max: 0.019358 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000216 | Grad Max: 0.016022 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003588 | Grad Max: 0.081661 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.001729 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001316 | Grad Max: 0.006399 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000224 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000371 | Grad Max: 0.001341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000177 | Grad Max: 0.000865 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000620 | Grad Max: 0.002684 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006071 | Grad Max: 0.006071 [GRADIENT NORM TOTAL] 0.4623 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | 
Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5128648 0.4871352] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/2048 | B: 56/1992 | C: 175/1873 [LOSS Ex1] A: 0.00000 | B: 0.68799 | C: 0.68488 [LOGITS Ex2 A] Mean Abs: 0.649 | Max: 3.266 [LOSS Ex2] A: 0.47169 | B: 0.47146 | C: 0.47290 ** [JOINT LOSS] ** : 0.929640 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.019162 -> Layer: shared_layers.0.bias | Grad Mean: 0.010405 | Grad Max: 0.062593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001132 | Grad Max: 0.005233 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013734 | Grad Max: 0.013734 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000130 | Grad Max: 0.019016 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001840 | Grad Max: 0.106280 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000023 | Grad Max: 0.001097 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000433 | Grad Max: 0.003597 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000220 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000112 | Grad Max: 0.000923 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000053 | Grad Max: 0.000485 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000800 | Grad Max: 0.002571 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000533 | Grad Max: 0.000533 [GRADIENT NORM TOTAL] 0.2822 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.105 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027151 0.49728492] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 70/1786 | C: 172/1876 [LOSS Ex1] A: 0.00000 | B: 0.68945 | C: 0.68462 [LOGITS Ex2 A] Mean Abs: 0.670 | Max: 3.235 [LOSS Ex2] A: 
0.46666 | B: 0.46988 | C: 0.46928 ** [JOINT LOSS] ** : 0.926626 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.023929 -> Layer: shared_layers.0.bias | Grad Mean: 0.031077 | Grad Max: 0.159449 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001140 | Grad Max: 0.005123 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015619 | Grad Max: 0.015619 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.012273 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005208 | Grad Max: 0.055818 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.002086 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002362 | Grad Max: 0.008478 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000408 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000719 | Grad Max: 0.002429 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000351 | Grad Max: 0.001432 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001376 | Grad Max: 0.005037 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015139 | Grad Max: 0.015139 [GRADIENT NORM TOTAL] 0.6396 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.187 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51444244 0.48555753] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 71/1977 | C: 179/1869 [LOSS Ex1] A: 0.68675 | B: 0.68914 | C: 0.68450 [LOGITS Ex2 A] Mean Abs: 0.682 | Max: 3.440 [LOSS Ex2] A: 0.47844 | B: 0.49233 | C: 0.47484 ** [JOINT LOSS] ** : 1.168669 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.054020 -> Layer: shared_layers.0.bias | Grad Mean: 0.053721 | Grad Max: 0.305412 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.010456 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019958 | Grad Max: 0.019958 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000501 | Grad Max: 0.018563 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009281 | Grad Max: 0.103803 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.003335 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004247 | Grad Max: 0.015061 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000574 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001248 | Grad Max: 0.003313 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000458 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000597 | Grad Max: 0.001832 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002152 | Grad Max: 0.004679 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023331 | Grad Max: 0.023331 [GRADIENT NORM TOTAL] 1.0954 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.186 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090493 0.4909507] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 1/2047 | B: 62/1986 | C: 165/1883 [LOSS Ex1] A: 0.68687 | B: 0.68927 | C: 0.68581 [LOGITS Ex2 A] Mean Abs: 0.664 | Max: 3.373 [LOSS Ex2] A: 0.46922 | B: 0.48119 | C: 0.49412 ** [JOINT LOSS] ** : 1.168828 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001204 | Grad Max: 0.028986 -> Layer: shared_layers.0.bias | Grad Mean: 0.031490 | Grad Max: 0.170629 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.010832 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022262 | Grad Max: 0.022262 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 0.010834 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005158 | Grad Max: 0.060355 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.001832 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002252 | Grad Max: 0.008203 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000362 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.000675 | Grad Max: 0.002232 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000255 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000324 | Grad Max: 0.001086 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001246 | Grad Max: 0.003679 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014061 | Grad Max: 0.014061 [GRADIENT NORM TOTAL] 0.6440 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.190 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5136271 0.48637292] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 58/1990 | C: 182/1866 [LOSS Ex1] A: 0.68588 | B: 0.68780 | C: 0.68430 [LOGITS Ex2 A] Mean Abs: 0.657 | Max: 3.327 [LOSS Ex2] A: 0.47414 | B: 0.46875 | C: 0.47867 ** [JOINT LOSS] ** : 1.159848 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001656 | Grad Max: 0.032548 -> Layer: shared_layers.0.bias | Grad Mean: 0.022686 | Grad Max: 0.115095 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.010869 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016173 | Grad Max: 0.016173 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000253 | Grad Max: 0.015982 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003576 | Grad Max: 0.085864 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.001574 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001197 | Grad Max: 0.007277 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000338 | Grad Max: 0.001266 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000219 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000163 | Grad Max: 0.000840 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000732 | Grad Max: 0.003101 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007667 | Grad Max: 0.007667 [GRADIENT NORM TOTAL] 
0.4790 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5147493 0.48525068] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 72/1784 | C: 143/1233 [LOSS Ex1] A: 0.68641 | B: 0.68926 | C: 0.68260 [LOGITS Ex2 A] Mean Abs: 0.661 | Max: 3.295 [LOSS Ex2] A: 0.47782 | B: 0.47439 | C: 0.48361 ** [JOINT LOSS] ** : 1.164697 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001144 | Grad Max: 0.022514 -> Layer: shared_layers.0.bias | Grad Mean: 0.014887 | Grad Max: 0.062222 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.009186 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008740 | Grad Max: 0.008740 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000157 | Grad Max: 0.014752 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002628 | Grad Max: 0.079948 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000034 | Grad Max: 0.001680 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000873 | Grad Max: 0.006229 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000236 | Grad Max: 0.001369 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000196 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000577 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.002209 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003823 | Grad Max: 0.003823 [GRADIENT NORM TOTAL] 0.3359 [EPOCH SUMMARY] Train Loss: 1.1051 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.1435 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 9/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.137 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027774 0.4972226] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/2048 | B: 74/1974 | C: 202/1846 [LOSS Ex1] A: 0.00000 | B: 0.68896 | C: 0.68374 [LOGITS Ex2 A] Mean Abs: 0.655 | Max: 3.255 [LOSS Ex2] A: 0.46883 | B: 0.49422 | C: 0.46354 ** [JOINT LOSS] ** : 0.933097 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.026452 -> Layer: shared_layers.0.bias | Grad Mean: 0.010104 | Grad Max: 0.066075 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.005383 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013552 | Grad Max: 0.013552 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000127 | Grad Max: 0.011984 -> Layer: exit2_layers.0.bias | Grad Mean: 0.001860 | Grad Max: 0.068349 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000024 | Grad Max: 0.001347 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000477 | Grad Max: 0.005588 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000220 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000111 | Grad Max: 0.000948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000165 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000052 | Grad Max: 0.000435 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001696 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000122 | Grad Max: 0.000122 [GRADIENT NORM TOTAL] 0.2631 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.171 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122877 0.4877123] | Indices: [0 1] 
| Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 1/2047 | B: 64/1984 | C: 192/1856 [LOSS Ex1] A: 0.68682 | B: 0.68909 | C: 0.68516 [LOGITS Ex2 A] Mean Abs: 0.694 | Max: 3.402 [LOSS Ex2] A: 0.47415 | B: 0.48372 | C: 0.48136 ** [JOINT LOSS] ** : 1.166768 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001194 | Grad Max: 0.026759 -> Layer: shared_layers.0.bias | Grad Mean: 0.021748 | Grad Max: 0.134026 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.010736 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022208 | Grad Max: 0.022208 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000207 | Grad Max: 0.020800 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003633 | Grad Max: 0.115252 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.001743 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001421 | Grad Max: 0.007100 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000284 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000395 | Grad Max: 0.001468 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000180 | Grad Max: 0.000831 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000506 | Grad Max: 0.002078 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006018 | Grad Max: 0.006018 [GRADIENT NORM TOTAL] 0.4710 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.023 | Max: 0.191 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5111625 0.48883748] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 62/1986 | C: 181/1867 [LOSS Ex1] A: 0.68723 | B: 0.68761 | C: 0.68385 [LOGITS Ex2 A] Mean Abs: 0.679 | Max: 3.656 [LOSS Ex2] A: 0.47307 | B: 0.47886 | C: 0.49758 ** [JOINT LOSS] ** : 1.169400 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.060058 -> Layer: 
shared_layers.0.bias | Grad Mean: 0.020374 | Grad Max: 0.080368 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001766 | Grad Max: 0.009279 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011004 | Grad Max: 0.011004 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000292 | Grad Max: 0.016634 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004606 | Grad Max: 0.075836 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.002079 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001788 | Grad Max: 0.008455 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000335 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000528 | Grad Max: 0.001813 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000251 | Grad Max: 0.000930 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001123 | Grad Max: 0.003302 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011497 | Grad Max: 0.011497 [GRADIENT NORM TOTAL] 0.5580 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51477134 0.4852286 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/2048 | B: 73/1783 | C: 183/1865 [LOSS Ex1] A: 0.00000 | B: 0.68908 | C: 0.68472 [LOGITS Ex2 A] Mean Abs: 0.692 | Max: 3.349 [LOSS Ex2] A: 0.46862 | B: 0.46493 | C: 0.48307 ** [JOINT LOSS] ** : 0.930140 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001559 | Grad Max: 0.037249 -> Layer: shared_layers.0.bias | Grad Mean: 0.012560 | Grad Max: 0.057478 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.005001 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017278 | Grad Max: 0.017278 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000186 | Grad Max: 0.018937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002428 | Grad Max: 0.104345 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000030 | Grad Max: 0.001374 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000524 | Grad Max: 0.005585 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000246 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000135 | Grad Max: 0.000846 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000061 | Grad Max: 0.000451 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000497 | Grad Max: 0.002270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002091 | Grad Max: 0.002091 [GRADIENT NORM TOTAL] 0.3650 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.106 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50404954 0.49595043] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/1616 | B: 74/1974 | C: 187/1861 [LOSS Ex1] A: 0.00000 | B: 0.68877 | C: 0.68429 [LOGITS Ex2 A] Mean Abs: 0.724 | Max: 3.327 [LOSS Ex2] A: 0.46005 | B: 0.49160 | C: 0.48888 ** [JOINT LOSS] ** : 0.937865 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001127 | Grad Max: 0.023882 -> Layer: shared_layers.0.bias | Grad Mean: 0.016781 | Grad Max: 0.108778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001085 | Grad Max: 0.004985 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011972 | Grad Max: 0.011972 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000170 | Grad Max: 0.013183 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002926 | Grad Max: 0.074350 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002027 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001083 | Grad Max: 0.009681 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000268 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000287 | Grad Max: 0.001446 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000238 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000130 | Grad Max: 0.000880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001758 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004081 | Grad Max: 0.004081 [GRADIENT NORM TOTAL] 0.3700 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.192 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5138496 0.4861504] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 1/2047 | B: 65/1983 | C: 189/1859 [LOSS Ex1] A: 0.68635 | B: 0.68891 | C: 0.68549 [LOGITS Ex2 A] Mean Abs: 0.718 | Max: 3.439 [LOSS Ex2] A: 0.47115 | B: 0.46979 | C: 0.47567 ** [JOINT LOSS] ** : 1.159119 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.047926 -> Layer: shared_layers.0.bias | Grad Mean: 0.043314 | Grad Max: 0.206790 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.009755 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016387 | Grad Max: 0.016387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000418 | Grad Max: 0.015573 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007490 | Grad Max: 0.084449 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.003004 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003255 | Grad Max: 0.012216 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000490 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000929 | Grad Max: 0.002720 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000346 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000426 | Grad Max: 0.001447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001422 | Grad Max: 0.003888 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015619 | Grad Max: 0.015619 [GRADIENT NORM TOTAL] 0.8769 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 
0.024 | Max: 0.191 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084026 0.49159738] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 1/2047 | B: 63/1985 | C: 186/1862 [LOSS Ex1] A: 0.68643 | B: 0.68741 | C: 0.68346 [LOGITS Ex2 A] Mean Abs: 0.694 | Max: 3.552 [LOSS Ex2] A: 0.47876 | B: 0.46879 | C: 0.49046 ** [JOINT LOSS] ** : 1.165101 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002340 | Grad Max: 0.059492 -> Layer: shared_layers.0.bias | Grad Mean: 0.093376 | Grad Max: 0.563302 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.010911 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017346 | Grad Max: 0.017346 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000881 | Grad Max: 0.035841 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016203 | Grad Max: 0.189919 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.004881 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007829 | Grad Max: 0.024360 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000845 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002230 | Grad Max: 0.005557 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000651 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001036 | Grad Max: 0.003085 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003767 | Grad Max: 0.008241 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042664 | Grad Max: 0.042664 [GRADIENT NORM TOTAL] 1.9353 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5129538 0.48704612] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 75/1781 | C: 200/1848 [LOSS Ex1] A: 0.68546 | B: 0.68889 | C: 0.68272 [LOGITS Ex2 A] Mean Abs: 0.687 | Max: 3.395 [LOSS Ex2] A: 0.46267 | B: 0.46106 | C: 0.47111 ** [JOINT LOSS] ** : 1.150635 [GRADIENTS 
CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001714 | Grad Max: 0.047814 -> Layer: shared_layers.0.bias | Grad Mean: 0.073308 | Grad Max: 0.432125 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.010772 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014062 | Grad Max: 0.014062 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000657 | Grad Max: 0.024359 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011707 | Grad Max: 0.134496 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.003845 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005429 | Grad Max: 0.016937 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000614 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001530 | Grad Max: 0.004014 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000488 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000706 | Grad Max: 0.002386 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002468 | Grad Max: 0.007572 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029306 | Grad Max: 0.029306 [GRADIENT NORM TOTAL] 1.4633 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.176 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5141448 0.48585525] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 74/1974 | C: 192/1856 [LOSS Ex1] A: 0.68611 | B: 0.68858 | C: 0.68256 [LOGITS Ex2 A] Mean Abs: 0.734 | Max: 3.529 [LOSS Ex2] A: 0.46413 | B: 0.49553 | C: 0.46041 ** [JOINT LOSS] ** : 1.159107 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002760 | Grad Max: 0.070414 -> Layer: shared_layers.0.bias | Grad Mean: 0.087343 | Grad Max: 0.516504 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001718 | Grad Max: 0.008749 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005541 | Grad Max: 0.005541 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000814 | Grad Max: 0.032581 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.015247 | Grad Max: 0.179694 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.004707 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006912 | Grad Max: 0.023590 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000741 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001963 | Grad Max: 0.005083 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000546 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.002672 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003036 | Grad Max: 0.006513 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034935 | Grad Max: 0.034935 [GRADIENT NORM TOTAL] 1.7703 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.138 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50181407 0.4981859 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/2048 | B: 67/1981 | C: 173/1875 [LOSS Ex1] A: 0.00000 | B: 0.68873 | C: 0.68415 [LOGITS Ex2 A] Mean Abs: 0.702 | Max: 3.346 [LOSS Ex2] A: 0.46875 | B: 0.47718 | C: 0.45603 ** [JOINT LOSS] ** : 0.924949 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002791 | Grad Max: 0.074316 -> Layer: shared_layers.0.bias | Grad Mean: 0.085719 | Grad Max: 0.536801 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001099 | Grad Max: 0.005114 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011510 | Grad Max: 0.011510 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.030582 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015447 | Grad Max: 0.169977 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.004361 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007086 | Grad Max: 0.023471 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000779 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002002 | Grad Max: 0.005150 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000549 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.002567 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003117 | Grad Max: 0.006993 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035958 | Grad Max: 0.035958 [GRADIENT NORM TOTAL] 1.7951 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.175 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51424253 0.48575744] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 66/1982 | C: 178/1870 [LOSS Ex1] A: 0.68641 | B: 0.68723 | C: 0.68551 [LOGITS Ex2 A] Mean Abs: 0.716 | Max: 3.689 [LOSS Ex2] A: 0.45461 | B: 0.46808 | C: 0.46917 ** [JOINT LOSS] ** : 1.150337 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001294 | Grad Max: 0.028399 -> Layer: shared_layers.0.bias | Grad Mean: 0.027702 | Grad Max: 0.146661 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.010998 -> Layer: exit1_layers.0.bias | Grad Mean: 0.024301 | Grad Max: 0.024301 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000299 | Grad Max: 0.015701 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005096 | Grad Max: 0.085155 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.003171 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002293 | Grad Max: 0.011854 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000387 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000640 | Grad Max: 0.002047 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000294 | Grad Max: 0.001263 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001175 | Grad Max: 0.003974 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012939 | Grad Max: 0.012939 [GRADIENT NORM TOTAL] 0.6098 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105915 0.48940846] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 76/1780 | C: 209/1839 [LOSS Ex1] A: 0.68691 | B: 0.68873 | C: 0.68378 [LOGITS Ex2 A] Mean Abs: 0.717 | Max: 3.417 [LOSS Ex2] A: 0.47141 | B: 0.46335 | C: 0.47832 ** [JOINT LOSS] ** : 1.157498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002792 | Grad Max: 0.088956 -> Layer: shared_layers.0.bias | Grad Mean: 0.073397 | Grad Max: 0.408091 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.009992 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018235 | Grad Max: 0.018235 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.026108 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013026 | Grad Max: 0.130656 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000199 | Grad Max: 0.003871 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006127 | Grad Max: 0.020543 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000742 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001740 | Grad Max: 0.004384 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000542 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000793 | Grad Max: 0.002497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002875 | Grad Max: 0.006277 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032277 | Grad Max: 0.032277 [GRADIENT NORM TOTAL] 1.5273 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5164506 0.48354942] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/2048 | B: 77/1971 | C: 184/1864 [LOSS Ex1] A: 0.00000 | B: 0.68842 | C: 0.68585 [LOGITS Ex2 A] Mean Abs: 0.733 | Max: 3.682 [LOSS Ex2] 
A: 0.45912 | B: 0.48607 | C: 0.46631 ** [JOINT LOSS] ** : 0.928588 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001227 | Grad Max: 0.017235 -> Layer: shared_layers.0.bias | Grad Mean: 0.019766 | Grad Max: 0.112513 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001331 | Grad Max: 0.005052 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022041 | Grad Max: 0.022041 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000201 | Grad Max: 0.013877 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.075387 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.001717 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001086 | Grad Max: 0.007974 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000245 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000275 | Grad Max: 0.001209 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000599 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000453 | Grad Max: 0.001686 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004798 | Grad Max: 0.004798 [GRADIENT NORM TOTAL] 0.4242 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.107 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052409 0.4947591] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.004 [MASKS] A(Pass/Fail): 0/1616 | B: 76/1972 | C: 146/1230 [LOSS Ex1] A: 0.00000 | B: 0.68857 | C: 0.68262 [LOGITS Ex2 A] Mean Abs: 0.772 | Max: 3.828 [LOSS Ex2] A: 0.44620 | B: 0.47798 | C: 0.46226 ** [JOINT LOSS] ** : 0.919210 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001442 | Grad Max: 0.027572 -> Layer: shared_layers.0.bias | Grad Mean: 0.053342 | Grad Max: 0.344007 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001076 | Grad Max: 0.005398 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006955 | Grad Max: 0.006955 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000480 | Grad Max: 0.020292 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009116 | Grad Max: 0.115829 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000131 | Grad Max: 0.003272 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004173 | Grad Max: 0.014280 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000586 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001153 | Grad Max: 0.003087 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000369 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000519 | Grad Max: 0.001595 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001561 | Grad Max: 0.004042 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019281 | Grad Max: 0.019281 [GRADIENT NORM TOTAL] 1.1196 [EPOCH SUMMARY] Train Loss: 1.0608 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1271 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.1291 -> New: 1.1271) ############################## EPOCH 10/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.197 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5133361 0.4866639] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 72/1976 | C: 194/1854 [LOSS Ex1] A: 0.68600 | B: 0.68705 | C: 0.68479 [LOGITS Ex2 A] Mean Abs: 0.747 | Max: 3.831 [LOSS Ex2] A: 0.45404 | B: 0.45926 | C: 0.47325 ** [JOINT LOSS] ** : 1.148130 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001364 | Grad Max: 0.025969 -> Layer: shared_layers.0.bias | Grad Mean: 0.013280 | Grad Max: 0.061942 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.010563 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017898 | Grad Max: 0.017898 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000188 | Grad Max: 0.011204 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.002706 | Grad Max: 0.060313 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000039 | Grad Max: 0.001714 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000994 | Grad Max: 0.007532 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000199 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000269 | Grad Max: 0.001084 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000185 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000587 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000653 | Grad Max: 0.002778 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006004 | Grad Max: 0.006004 [GRADIENT NORM TOTAL] 0.3528 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077994 0.49220058] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 87/1769 | C: 186/1862 [LOSS Ex1] A: 0.68605 | B: 0.68856 | C: 0.68430 [LOGITS Ex2 A] Mean Abs: 0.743 | Max: 3.517 [LOSS Ex2] A: 0.44743 | B: 0.46270 | C: 0.46695 ** [JOINT LOSS] ** : 1.145330 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001474 | Grad Max: 0.030208 -> Layer: shared_layers.0.bias | Grad Mean: 0.032230 | Grad Max: 0.217679 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.011473 -> Layer: exit1_layers.0.bias | Grad Mean: 0.025315 | Grad Max: 0.025315 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000354 | Grad Max: 0.013347 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006141 | Grad Max: 0.072417 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.003009 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002832 | Grad Max: 0.014176 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000378 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000794 | Grad Max: 0.002349 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000360 | Grad Max: 0.001302 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001383 | Grad Max: 0.003999 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015233 | Grad Max: 0.015233 [GRADIENT NORM TOTAL] 0.7299 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.199 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5123466 0.48765343] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 89/1959 | C: 196/1852 [LOSS Ex1] A: 0.68509 | B: 0.68824 | C: 0.68388 [LOGITS Ex2 A] Mean Abs: 0.748 | Max: 3.794 [LOSS Ex2] A: 0.44872 | B: 0.49099 | C: 0.44997 ** [JOINT LOSS] ** : 1.148964 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.059118 -> Layer: shared_layers.0.bias | Grad Mean: 0.028637 | Grad Max: 0.115661 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.010471 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014613 | Grad Max: 0.014613 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000318 | Grad Max: 0.021059 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005263 | Grad Max: 0.114654 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.002075 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001963 | Grad Max: 0.008975 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000349 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000530 | Grad Max: 0.001910 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000236 | Grad Max: 0.000945 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000730 | Grad Max: 0.002600 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008213 | Grad Max: 0.008213 [GRADIENT NORM TOTAL] 0.6594 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | 
Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51357454 0.48642546] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 88/1960 | C: 184/1864 [LOSS Ex1] A: 0.68585 | B: 0.68840 | C: 0.68413 [LOGITS Ex2 A] Mean Abs: 0.726 | Max: 3.530 [LOSS Ex2] A: 0.45699 | B: 0.46714 | C: 0.47432 ** [JOINT LOSS] ** : 1.152276 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001219 | Grad Max: 0.020011 -> Layer: shared_layers.0.bias | Grad Mean: 0.011951 | Grad Max: 0.092182 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001641 | Grad Max: 0.008473 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006319 | Grad Max: 0.006319 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000158 | Grad Max: 0.028019 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002442 | Grad Max: 0.158371 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000026 | Grad Max: 0.001688 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000516 | Grad Max: 0.005438 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000208 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000125 | Grad Max: 0.000972 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000057 | Grad Max: 0.000452 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000486 | Grad Max: 0.001970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001763 | Grad Max: 0.001763 [GRADIENT NORM TOTAL] 0.3845 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.024 | Max: 0.138 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009043 0.49909562] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 78/1970 | C: 221/1827 [LOSS Ex1] A: 0.00000 | B: 0.68687 | C: 0.68279 [LOGITS Ex2 A] Mean Abs: 0.713 | Max: 3.564 [LOSS Ex2] 
A: 0.45425 | B: 0.47095 | C: 0.47844 ** [JOINT LOSS] ** : 0.924433 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001235 | Grad Max: 0.024147 -> Layer: shared_layers.0.bias | Grad Mean: 0.010344 | Grad Max: 0.074168 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001157 | Grad Max: 0.005738 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010122 | Grad Max: 0.010122 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000163 | Grad Max: 0.019803 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002313 | Grad Max: 0.101517 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000029 | Grad Max: 0.001556 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000600 | Grad Max: 0.006855 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000246 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000141 | Grad Max: 0.001169 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000063 | Grad Max: 0.000424 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001862 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003211 | Grad Max: 0.003211 [GRADIENT NORM TOTAL] 0.3264 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5161198 0.48388022] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 90/1766 | C: 224/1824 [LOSS Ex1] A: 0.68602 | B: 0.68838 | C: 0.68205 [LOGITS Ex2 A] Mean Abs: 0.752 | Max: 3.812 [LOSS Ex2] A: 0.44935 | B: 0.45496 | C: 0.46077 ** [JOINT LOSS] ** : 1.140511 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001490 | Grad Max: 0.026572 -> Layer: shared_layers.0.bias | Grad Mean: 0.030728 | Grad Max: 0.182884 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.010803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019324 | Grad Max: 0.019324 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.000308 | Grad Max: 0.021166 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005375 | Grad Max: 0.116976 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.002503 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002270 | Grad Max: 0.010244 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000334 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000605 | Grad Max: 0.001972 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000265 | Grad Max: 0.001077 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000872 | Grad Max: 0.003017 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009784 | Grad Max: 0.009784 [GRADIENT NORM TOTAL] 0.6627 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509994 0.49000606] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 89/1959 | C: 197/1851 [LOSS Ex1] A: 0.68661 | B: 0.68806 | C: 0.68420 [LOGITS Ex2 A] Mean Abs: 0.740 | Max: 3.717 [LOSS Ex2] A: 0.45434 | B: 0.48081 | C: 0.47417 ** [JOINT LOSS] ** : 1.156062 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002657 | Grad Max: 0.074216 -> Layer: shared_layers.0.bias | Grad Mean: 0.076079 | Grad Max: 0.447547 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001826 | Grad Max: 0.009485 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016288 | Grad Max: 0.016288 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000754 | Grad Max: 0.030116 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013695 | Grad Max: 0.159137 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.004444 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006343 | Grad Max: 0.021485 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000662 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.001747 | Grad Max: 0.004416 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000536 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000780 | Grad Max: 0.002608 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002789 | Grad Max: 0.006133 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032289 | Grad Max: 0.032289 [GRADIENT NORM TOTAL] 1.5949 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.085 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51812553 0.4818745 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 89/1959 | C: 177/1871 [LOSS Ex1] A: 0.00000 | B: 0.68823 | C: 0.68418 [LOGITS Ex2 A] Mean Abs: 0.745 | Max: 3.669 [LOSS Ex2] A: 0.45280 | B: 0.46137 | C: 0.46218 ** [JOINT LOSS] ** : 0.916252 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001208 | Grad Max: 0.018785 -> Layer: shared_layers.0.bias | Grad Mean: 0.011505 | Grad Max: 0.069626 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001056 | Grad Max: 0.004796 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012734 | Grad Max: 0.012734 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000170 | Grad Max: 0.012804 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002590 | Grad Max: 0.066970 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000035 | Grad Max: 0.001644 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000908 | Grad Max: 0.006190 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000242 | Grad Max: 0.001161 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000193 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000105 | Grad Max: 0.000646 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000546 | Grad Max: 0.002208 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004485 | Grad Max: 0.004485 [GRADIENT NORM TOTAL] 
0.3257 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.107 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064614 0.4935386] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 81/1967 | C: 207/1841 [LOSS Ex1] A: 0.00000 | B: 0.68668 | C: 0.68280 [LOGITS Ex2 A] Mean Abs: 0.823 | Max: 3.585 [LOSS Ex2] A: 0.42787 | B: 0.47280 | C: 0.45522 ** [JOINT LOSS] ** : 0.908454 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.051861 -> Layer: shared_layers.0.bias | Grad Mean: 0.083968 | Grad Max: 0.499453 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.005288 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007069 | Grad Max: 0.007069 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.029058 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014010 | Grad Max: 0.153824 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000209 | Grad Max: 0.005177 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006649 | Grad Max: 0.022915 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000770 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001777 | Grad Max: 0.004704 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000434 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000776 | Grad Max: 0.002254 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002381 | Grad Max: 0.005399 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029207 | Grad Max: 0.029207 [GRADIENT NORM TOTAL] 1.7038 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5128182 0.48718178] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 91/1765 | C: 190/1858 [LOSS Ex1] A: 0.68565 | B: 
0.68820 | C: 0.68346 [LOGITS Ex2 A] Mean Abs: 0.783 | Max: 3.681 [LOSS Ex2] A: 0.44157 | B: 0.46114 | C: 0.45594 ** [JOINT LOSS] ** : 1.138656 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001863 | Grad Max: 0.036404 -> Layer: shared_layers.0.bias | Grad Mean: 0.048264 | Grad Max: 0.313620 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.010262 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016002 | Grad Max: 0.016002 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000462 | Grad Max: 0.029540 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008653 | Grad Max: 0.167199 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.003473 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003917 | Grad Max: 0.015261 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000484 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001036 | Grad Max: 0.002923 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000373 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000455 | Grad Max: 0.001676 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001441 | Grad Max: 0.004378 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017561 | Grad Max: 0.017561 [GRADIENT NORM TOTAL] 1.0339 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50717443 0.49282557] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 91/1957 | C: 197/1851 [LOSS Ex1] A: 0.68567 | B: 0.68788 | C: 0.68540 [LOGITS Ex2 A] Mean Abs: 0.770 | Max: 3.859 [LOSS Ex2] A: 0.44760 | B: 0.49874 | C: 0.47310 ** [JOINT LOSS] ** : 1.159465 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003548 | Grad Max: 0.104352 -> Layer: shared_layers.0.bias | Grad Mean: 0.156232 | Grad Max: 0.959014 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.011222 
-> Layer: exit1_layers.0.bias | Grad Mean: 0.026301 | Grad Max: 0.026301 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001443 | Grad Max: 0.056073 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027228 | Grad Max: 0.307445 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.008781 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013226 | Grad Max: 0.041405 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001334 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003549 | Grad Max: 0.008412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000065 | Grad Max: 0.000903 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001546 | Grad Max: 0.004419 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005087 | Grad Max: 0.010448 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060951 | Grad Max: 0.060951 [GRADIENT NORM TOTAL] 3.2571 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.204 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117793 0.48822075] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 92/1956 | C: 232/1816 [LOSS Ex1] A: 0.68473 | B: 0.68806 | C: 0.68270 [LOGITS Ex2 A] Mean Abs: 0.769 | Max: 3.975 [LOSS Ex2] A: 0.44994 | B: 0.48762 | C: 0.49775 ** [JOINT LOSS] ** : 1.163594 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003526 | Grad Max: 0.115918 -> Layer: shared_layers.0.bias | Grad Mean: 0.203352 | Grad Max: 1.268148 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001916 | Grad Max: 0.010120 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010959 | Grad Max: 0.010959 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001784 | Grad Max: 0.063381 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034152 | Grad Max: 0.359569 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000520 | Grad Max: 0.011182 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016812 | Grad Max: 0.056261 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000126 | Grad Max: 0.001573 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004479 | Grad Max: 0.010434 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000081 | Grad Max: 0.001096 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001954 | Grad Max: 0.005482 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006391 | Grad Max: 0.012832 -> Layer: exit2_layers.12.bias | Grad Mean: 0.077901 | Grad Max: 0.077901 [GRADIENT NORM TOTAL] 4.1959 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5130732 0.4869268] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 84/1964 | C: 218/1830 [LOSS Ex1] A: 0.68559 | B: 0.68650 | C: 0.68307 [LOGITS Ex2 A] Mean Abs: 0.756 | Max: 3.874 [LOSS Ex2] A: 0.45297 | B: 0.45377 | C: 0.45457 ** [JOINT LOSS] ** : 1.138829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.057328 -> Layer: shared_layers.0.bias | Grad Mean: 0.065638 | Grad Max: 0.412930 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.008875 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007432 | Grad Max: 0.007432 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000657 | Grad Max: 0.024165 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011974 | Grad Max: 0.129580 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.004838 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005686 | Grad Max: 0.021343 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000675 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001537 | Grad Max: 0.004156 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000401 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000670 | Grad Max: 0.002016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002291 | Grad Max: 0.006314 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.026883 | Grad Max: 0.026883 [GRADIENT NORM TOTAL] 1.4097 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.139 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000784 0.49992162] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 91/1765 | C: 147/1229 [LOSS Ex1] A: 0.00000 | B: 0.68804 | C: 0.68306 [LOGITS Ex2 A] Mean Abs: 0.828 | Max: 3.737 [LOSS Ex2] A: 0.45484 | B: 0.47834 | C: 0.44592 ** [JOINT LOSS] ** : 0.916735 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006019 | Grad Max: 0.165601 -> Layer: shared_layers.0.bias | Grad Mean: 0.209304 | Grad Max: 1.274282 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001185 | Grad Max: 0.005551 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015166 | Grad Max: 0.015166 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.071834 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037146 | Grad Max: 0.345268 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000556 | Grad Max: 0.012017 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017591 | Grad Max: 0.058004 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000134 | Grad Max: 0.001550 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004677 | Grad Max: 0.011137 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000086 | Grad Max: 0.001138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002022 | Grad Max: 0.005710 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006541 | Grad Max: 0.014409 -> Layer: exit2_layers.12.bias | Grad Mean: 0.077697 | Grad Max: 0.077697 [GRADIENT NORM TOTAL] 4.3144 [EPOCH SUMMARY] Train Loss: 1.0827 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.1723 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 11/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51775277 0.4822472 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 92/1956 | C: 217/1831 [LOSS Ex1] A: 0.68568 | B: 0.68773 | C: 0.68273 [LOGITS Ex2 A] Mean Abs: 0.899 | Max: 3.696 [LOSS Ex2] A: 0.49196 | B: 0.52176 | C: 0.46937 ** [JOINT LOSS] ** : 1.179742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007432 | Grad Max: 0.220869 -> Layer: shared_layers.0.bias | Grad Mean: 0.294796 | Grad Max: 1.818786 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001916 | Grad Max: 0.010113 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015760 | Grad Max: 0.015760 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002733 | Grad Max: 0.105764 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051560 | Grad Max: 0.548172 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000772 | Grad Max: 0.016081 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024593 | Grad Max: 0.077051 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000186 | Grad Max: 0.002244 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006525 | Grad Max: 0.015768 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000118 | Grad Max: 0.001464 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002805 | Grad Max: 0.007749 -> Layer: exit2_layers.12.weight | Grad Mean: 0.009204 | Grad Max: 0.022404 -> Layer: exit2_layers.12.bias | Grad Mean: 0.108507 | Grad Max: 0.108507 [GRADIENT NORM TOTAL] 6.1022 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.204 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095347 0.49046525] | Indices: [1 
0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 95/1953 | C: 190/1858 [LOSS Ex1] A: 0.68635 | B: 0.68792 | C: 0.68505 [LOGITS Ex2 A] Mean Abs: 0.854 | Max: 3.611 [LOSS Ex2] A: 0.47214 | B: 0.49705 | C: 0.48243 ** [JOINT LOSS] ** : 1.170317 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005289 | Grad Max: 0.159863 -> Layer: shared_layers.0.bias | Grad Mean: 0.243136 | Grad Max: 1.496643 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001765 | Grad Max: 0.009411 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016929 | Grad Max: 0.016929 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.081376 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041880 | Grad Max: 0.443695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000629 | Grad Max: 0.012926 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020254 | Grad Max: 0.064436 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000151 | Grad Max: 0.001847 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005346 | Grad Max: 0.012582 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000096 | Grad Max: 0.001236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002302 | Grad Max: 0.006359 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007379 | Grad Max: 0.017281 -> Layer: exit2_layers.12.bias | Grad Mean: 0.088368 | Grad Max: 0.088368 [GRADIENT NORM TOTAL] 5.0254 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.085 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51944965 0.48055032] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 85/1963 | C: 210/1838 [LOSS Ex1] A: 0.00000 | B: 0.68636 | C: 0.68456 [LOGITS Ex2 A] Mean Abs: 0.785 | Max: 3.828 [LOSS Ex2] A: 0.43845 | B: 0.44822 | C: 0.45292 ** [JOINT LOSS] ** : 0.903504 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.025647 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.024016 | Grad Max: 0.124068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001361 | Grad Max: 0.005784 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022667 | Grad Max: 0.022667 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000226 | Grad Max: 0.014092 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004099 | Grad Max: 0.074218 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.001900 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001739 | Grad Max: 0.007864 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000273 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000440 | Grad Max: 0.001520 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000190 | Grad Max: 0.000739 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000603 | Grad Max: 0.002320 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006619 | Grad Max: 0.006619 [GRADIENT NORM TOTAL] 0.5075 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.108 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074163 0.49258372] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 94/1762 | C: 207/1841 [LOSS Ex1] A: 0.00000 | B: 0.68791 | C: 0.68303 [LOGITS Ex2 A] Mean Abs: 0.815 | Max: 3.806 [LOSS Ex2] A: 0.46002 | B: 0.47158 | C: 0.49686 ** [JOINT LOSS] ** : 0.933134 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006456 | Grad Max: 0.174408 -> Layer: shared_layers.0.bias | Grad Mean: 0.264561 | Grad Max: 1.650712 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001190 | Grad Max: 0.005160 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016610 | Grad Max: 0.016610 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002502 | Grad Max: 0.092515 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046601 | Grad Max: 0.504049 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000698 | Grad Max: 0.015983 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022272 | Grad Max: 0.072204 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000167 | Grad Max: 0.002039 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005898 | Grad Max: 0.013480 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000106 | Grad Max: 0.001384 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002554 | Grad Max: 0.007446 -> Layer: exit2_layers.12.weight | Grad Mean: 0.008337 | Grad Max: 0.016748 -> Layer: exit2_layers.12.bias | Grad Mean: 0.100925 | Grad Max: 0.100925 [GRADIENT NORM TOTAL] 5.5279 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.206 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5124308 0.48756918] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 95/1953 | C: 227/1821 [LOSS Ex1] A: 0.68537 | B: 0.68760 | C: 0.68095 [LOGITS Ex2 A] Mean Abs: 0.839 | Max: 3.956 [LOSS Ex2] A: 0.48035 | B: 0.52403 | C: 0.54206 ** [JOINT LOSS] ** : 1.200122 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008331 | Grad Max: 0.253883 -> Layer: shared_layers.0.bias | Grad Mean: 0.345767 | Grad Max: 2.132583 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.009429 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006321 | Grad Max: 0.006321 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003242 | Grad Max: 0.116643 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061228 | Grad Max: 0.645406 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000921 | Grad Max: 0.020877 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029535 | Grad Max: 0.102221 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000219 | Grad Max: 0.002754 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007800 | Grad Max: 0.018314 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000139 | Grad Max: 0.001913 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.003383 | Grad Max: 0.009623 -> Layer: exit2_layers.12.weight | Grad Mean: 0.010841 | Grad Max: 0.023629 -> Layer: exit2_layers.12.bias | Grad Mean: 0.133612 | Grad Max: 0.133612 [GRADIENT NORM TOTAL] 7.2372 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.203 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067255 0.4932745] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 98/1950 | C: 192/1856 [LOSS Ex1] A: 0.68540 | B: 0.68780 | C: 0.68416 [LOGITS Ex2 A] Mean Abs: 0.814 | Max: 3.907 [LOSS Ex2] A: 0.46384 | B: 0.49383 | C: 0.49285 ** [JOINT LOSS] ** : 1.169291 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006528 | Grad Max: 0.192062 -> Layer: shared_layers.0.bias | Grad Mean: 0.256931 | Grad Max: 1.582020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.010326 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017335 | Grad Max: 0.017335 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002440 | Grad Max: 0.090673 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046072 | Grad Max: 0.501599 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000695 | Grad Max: 0.017436 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022252 | Grad Max: 0.080264 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000165 | Grad Max: 0.001894 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005879 | Grad Max: 0.013145 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000104 | Grad Max: 0.001353 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002543 | Grad Max: 0.006924 -> Layer: exit2_layers.12.weight | Grad Mean: 0.008208 | Grad Max: 0.017087 -> Layer: exit2_layers.12.bias | Grad Mean: 0.100520 | Grad Max: 0.100520 [GRADIENT NORM TOTAL] 5.4050 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 
0.026 | Max: 0.207 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113705 0.48862952] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 87/1961 | C: 205/1843 [LOSS Ex1] A: 0.68447 | B: 0.68624 | C: 0.68339 [LOGITS Ex2 A] Mean Abs: 0.787 | Max: 3.857 [LOSS Ex2] A: 0.43641 | B: 0.45016 | C: 0.45994 ** [JOINT LOSS] ** : 1.133537 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.025742 -> Layer: shared_layers.0.bias | Grad Mean: 0.034403 | Grad Max: 0.181037 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.010472 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013243 | Grad Max: 0.013243 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000363 | Grad Max: 0.024432 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006143 | Grad Max: 0.129939 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.002748 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002677 | Grad Max: 0.013278 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000345 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000703 | Grad Max: 0.002268 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000209 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000301 | Grad Max: 0.000988 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001029 | Grad Max: 0.003848 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012236 | Grad Max: 0.012236 [GRADIENT NORM TOTAL] 0.7577 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.186 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5127044 0.4872957] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 94/1762 | C: 205/1843 [LOSS Ex1] A: 0.68542 | B: 0.68780 | C: 0.68290 [LOGITS Ex2 A] Mean Abs: 0.888 | Max: 3.773 [LOSS Ex2] A: 0.47007 | B: 0.49505 | C: 0.45873 ** [JOINT LOSS] ** : 1.159994 [GRADIENTS 
CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005568 | Grad Max: 0.147721 -> Layer: shared_layers.0.bias | Grad Mean: 0.264905 | Grad Max: 1.623303 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001716 | Grad Max: 0.008827 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008575 | Grad Max: 0.008575 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002317 | Grad Max: 0.083412 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044196 | Grad Max: 0.471933 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000651 | Grad Max: 0.013522 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021051 | Grad Max: 0.068584 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000154 | Grad Max: 0.001844 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005557 | Grad Max: 0.012710 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000096 | Grad Max: 0.001166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002391 | Grad Max: 0.006458 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007376 | Grad Max: 0.017046 -> Layer: exit2_layers.12.bias | Grad Mean: 0.091763 | Grad Max: 0.091763 [GRADIENT NORM TOTAL] 5.3618 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.140 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50048566 0.4995143 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 96/1952 | C: 224/1824 [LOSS Ex1] A: 0.00000 | B: 0.68750 | C: 0.68225 [LOGITS Ex2 A] Mean Abs: 0.945 | Max: 3.631 [LOSS Ex2] A: 0.50740 | B: 0.55566 | C: 0.51999 ** [JOINT LOSS] ** : 0.984264 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009171 | Grad Max: 0.254047 -> Layer: shared_layers.0.bias | Grad Mean: 0.373188 | Grad Max: 2.268105 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001136 | Grad Max: 0.005231 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011432 | Grad Max: 0.011432 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003393 | Grad Max: 0.120513 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.064149 | Grad Max: 0.624417 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000944 | Grad Max: 0.020404 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030255 | Grad Max: 0.100224 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000222 | Grad Max: 0.002570 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007980 | Grad Max: 0.018242 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000140 | Grad Max: 0.001766 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003435 | Grad Max: 0.009557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.011157 | Grad Max: 0.028839 -> Layer: exit2_layers.12.bias | Grad Mean: 0.133817 | Grad Max: 0.133817 [GRADIENT NORM TOTAL] 7.6036 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.186 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5188811 0.48111892] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 100/1948 | C: 188/1860 [LOSS Ex1] A: 0.68544 | B: 0.68770 | C: 0.68641 [LOGITS Ex2 A] Mean Abs: 0.932 | Max: 3.645 [LOSS Ex2] A: 0.49208 | B: 0.53725 | C: 0.51779 ** [JOINT LOSS] ** : 1.202221 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008743 | Grad Max: 0.259802 -> Layer: shared_layers.0.bias | Grad Mean: 0.384300 | Grad Max: 2.333114 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.011263 -> Layer: exit1_layers.0.bias | Grad Mean: 0.027465 | Grad Max: 0.027465 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003458 | Grad Max: 0.127467 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065471 | Grad Max: 0.711829 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000969 | Grad Max: 0.021459 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031124 | Grad Max: 0.106107 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000228 | Grad Max: 0.002781 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008212 | Grad Max: 0.019112 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000143 | Grad Max: 0.001702 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003538 | Grad Max: 0.009761 -> Layer: exit2_layers.12.weight | Grad Mean: 0.011196 | Grad Max: 0.027571 -> Layer: exit2_layers.12.bias | Grad Mean: 0.136575 | Grad Max: 0.136575 [GRADIENT NORM TOTAL] 7.8462 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5091831 0.49081695] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 3/2045 | B: 88/1960 | C: 221/1827 [LOSS Ex1] A: 0.68618 | B: 0.68613 | C: 0.68311 [LOGITS Ex2 A] Mean Abs: 0.840 | Max: 3.843 [LOSS Ex2] A: 0.44925 | B: 0.47339 | C: 0.45054 ** [JOINT LOSS] ** : 1.142866 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003184 | Grad Max: 0.089576 -> Layer: shared_layers.0.bias | Grad Mean: 0.177262 | Grad Max: 1.098090 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.008875 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009117 | Grad Max: 0.009117 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001499 | Grad Max: 0.060501 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028958 | Grad Max: 0.343603 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000435 | Grad Max: 0.009946 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014187 | Grad Max: 0.049092 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001328 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003725 | Grad Max: 0.008749 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000063 | Grad Max: 0.000796 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001607 | Grad Max: 0.004516 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004788 | Grad Max: 0.010848 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061249 | Grad Max: 0.061249 [GRADIENT NORM TOTAL] 3.6332 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.085 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52042127 0.47957873] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 96/1760 | C: 237/1811 [LOSS Ex1] A: 0.00000 | B: 0.68771 | C: 0.68163 [LOGITS Ex2 A] Mean Abs: 0.789 | Max: 3.809 [LOSS Ex2] A: 0.43687 | B: 0.45023 | C: 0.45190 ** [JOINT LOSS] ** : 0.902783 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003730 | Grad Max: 0.123758 -> Layer: shared_layers.0.bias | Grad Mean: 0.124059 | Grad Max: 0.764436 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001195 | Grad Max: 0.005330 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013041 | Grad Max: 0.013041 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001227 | Grad Max: 0.063243 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022845 | Grad Max: 0.333691 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.007201 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010642 | Grad Max: 0.035396 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001012 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002833 | Grad Max: 0.006658 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000659 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001226 | Grad Max: 0.003372 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003952 | Grad Max: 0.009789 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048031 | Grad Max: 0.048031 [GRADIENT NORM TOTAL] 2.6572 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.109 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50810415 0.49189588] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 98/1950 | C: 239/1809 [LOSS Ex1] A: 0.00000 | B: 0.68740 | C: 0.68156 [LOGITS Ex2 A] Mean Abs: 0.828 | Max: 3.712 [LOSS 
Ex2] A: 0.46606 | B: 0.48997 | C: 0.47843 ** [JOINT LOSS] ** : 0.934474 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006954 | Grad Max: 0.192340 -> Layer: shared_layers.0.bias | Grad Mean: 0.263418 | Grad Max: 1.605785 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005525 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009388 | Grad Max: 0.009388 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002477 | Grad Max: 0.105378 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046345 | Grad Max: 0.540835 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000684 | Grad Max: 0.017244 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021928 | Grad Max: 0.083481 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000160 | Grad Max: 0.002006 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005807 | Grad Max: 0.013522 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000101 | Grad Max: 0.001346 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002525 | Grad Max: 0.007269 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007987 | Grad Max: 0.015838 -> Layer: exit2_layers.12.bias | Grad Mean: 0.099319 | Grad Max: 0.099319 [GRADIENT NORM TOTAL] 5.4524 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5121652 0.48783478] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 100/1948 | C: 138/1238 [LOSS Ex1] A: 0.68519 | B: 0.68761 | C: 0.68273 [LOGITS Ex2 A] Mean Abs: 0.829 | Max: 3.998 [LOSS Ex2] A: 0.44233 | B: 0.46888 | C: 0.48549 ** [JOINT LOSS] ** : 1.150741 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005712 | Grad Max: 0.158692 -> Layer: shared_layers.0.bias | Grad Mean: 0.213038 | Grad Max: 1.283029 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001876 | Grad Max: 0.010145 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014387 | Grad Max: 
0.014387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.086474 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037363 | Grad Max: 0.445395 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000545 | Grad Max: 0.012374 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017526 | Grad Max: 0.061047 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000128 | Grad Max: 0.001594 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004657 | Grad Max: 0.010950 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000080 | Grad Max: 0.001051 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002029 | Grad Max: 0.005876 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006369 | Grad Max: 0.013209 -> Layer: exit2_layers.12.bias | Grad Mean: 0.080014 | Grad Max: 0.080014 [GRADIENT NORM TOTAL] 4.3904 [EPOCH SUMMARY] Train Loss: 1.0834 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1089 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.1271 -> New: 1.1089) ############################## EPOCH 12/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.206 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50640607 0.493594 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 90/1958 | C: 213/1835 [LOSS Ex1] A: 0.68520 | B: 0.68604 | C: 0.68326 [LOGITS Ex2 A] Mean Abs: 0.802 | Max: 3.707 [LOSS Ex2] A: 0.43108 | B: 0.45379 | C: 0.45459 ** [JOINT LOSS] ** : 1.131323 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001815 | Grad Max: 0.042635 -> Layer: shared_layers.0.bias | Grad Mean: 0.028697 | Grad Max: 0.180230 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.010694 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017788 | Grad Max: 0.017788 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000340 | Grad Max: 0.040797 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.005643 | Grad Max: 0.234885 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003379 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002363 | Grad Max: 0.014826 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000645 | Grad Max: 0.002067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000283 | Grad Max: 0.001214 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001089 | Grad Max: 0.003535 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011870 | Grad Max: 0.011870 [GRADIENT NORM TOTAL] 0.7092 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.210 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5110951 0.4889049] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 97/1759 | C: 219/1829 [LOSS Ex1] A: 0.68428 | B: 0.68762 | C: 0.68314 [LOGITS Ex2 A] Mean Abs: 0.863 | Max: 3.666 [LOSS Ex2] A: 0.45505 | B: 0.47324 | C: 0.45412 ** [JOINT LOSS] ** : 1.145818 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006307 | Grad Max: 0.177276 -> Layer: shared_layers.0.bias | Grad Mean: 0.203399 | Grad Max: 1.223315 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.010832 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016064 | Grad Max: 0.016064 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001973 | Grad Max: 0.066316 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036688 | Grad Max: 0.365015 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000537 | Grad Max: 0.012486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017078 | Grad Max: 0.060265 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000125 | Grad Max: 0.001569 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004529 | Grad Max: 0.010819 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000078 | Grad Max: 0.001081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001968 | Grad Max: 0.005740 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006026 | Grad Max: 0.013751 -> Layer: exit2_layers.12.bias | Grad Mean: 0.075415 | Grad Max: 0.075415 [GRADIENT NORM TOTAL] 4.2289 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.188 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51245505 0.48754498] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 99/1949 | C: 207/1841 [LOSS Ex1] A: 0.68529 | B: 0.68732 | C: 0.68429 [LOGITS Ex2 A] Mean Abs: 0.894 | Max: 3.482 [LOSS Ex2] A: 0.46663 | B: 0.52036 | C: 0.48151 ** [JOINT LOSS] ** : 1.175133 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006488 | Grad Max: 0.170864 -> Layer: shared_layers.0.bias | Grad Mean: 0.275940 | Grad Max: 1.695373 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001675 | Grad Max: 0.008605 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009216 | Grad Max: 0.009216 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002532 | Grad Max: 0.093223 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048040 | Grad Max: 0.522449 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000701 | Grad Max: 0.015891 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022651 | Grad Max: 0.079529 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000162 | Grad Max: 0.002078 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005988 | Grad Max: 0.014215 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000101 | Grad Max: 0.001234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002601 | Grad Max: 0.007138 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007884 | Grad Max: 0.018929 -> Layer: exit2_layers.12.bias | Grad Mean: 0.099869 | Grad Max: 0.099869 [GRADIENT NORM TOTAL] 5.6987 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | 
Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.025 | Max: 0.140 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50089246 0.49910757] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 101/1947 | C: 220/1828 [LOSS Ex1] A: 0.00000 | B: 0.68753 | C: 0.68199 [LOGITS Ex2 A] Mean Abs: 0.825 | Max: 3.778 [LOSS Ex2] A: 0.45229 | B: 0.47766 | C: 0.46765 ** [JOINT LOSS] ** : 0.922374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005159 | Grad Max: 0.143156 -> Layer: shared_layers.0.bias | Grad Mean: 0.187044 | Grad Max: 1.126397 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001101 | Grad Max: 0.005395 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006424 | Grad Max: 0.006424 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001757 | Grad Max: 0.067053 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033002 | Grad Max: 0.367156 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000482 | Grad Max: 0.010852 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015460 | Grad Max: 0.052853 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004095 | Grad Max: 0.009503 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000070 | Grad Max: 0.000972 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001781 | Grad Max: 0.005360 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005362 | Grad Max: 0.012571 -> Layer: exit2_layers.12.bias | Grad Mean: 0.067999 | Grad Max: 0.067999 [GRADIENT NORM TOTAL] 3.8925 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.188 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51972604 0.48027393] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 90/1958 | C: 199/1849 [LOSS Ex1] A: 0.68526 | B: 0.68595 | C: 0.68345 [LOGITS Ex2 A] Mean Abs: 0.783 | Max: 3.762 [LOSS Ex2] 
A: 0.42220 | B: 0.44949 | C: 0.45103 ** [JOINT LOSS] ** : 1.125791 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.027018 -> Layer: shared_layers.0.bias | Grad Mean: 0.011775 | Grad Max: 0.083810 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.010420 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019576 | Grad Max: 0.019576 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000184 | Grad Max: 0.011772 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002606 | Grad Max: 0.065538 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000034 | Grad Max: 0.001977 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000798 | Grad Max: 0.007720 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000188 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000193 | Grad Max: 0.001102 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000532 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000732 | Grad Max: 0.002138 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002184 | Grad Max: 0.002184 [GRADIENT NORM TOTAL] 0.3434 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.210 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089323 0.4910677] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 97/1759 | C: 197/1851 [LOSS Ex1] A: 0.68604 | B: 0.68754 | C: 0.68321 [LOGITS Ex2 A] Mean Abs: 0.785 | Max: 3.927 [LOSS Ex2] A: 0.44756 | B: 0.45662 | C: 0.47459 ** [JOINT LOSS] ** : 1.145187 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005047 | Grad Max: 0.144888 -> Layer: shared_layers.0.bias | Grad Mean: 0.203139 | Grad Max: 1.239459 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.009303 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014286 | Grad Max: 0.014286 -> 
Layer: exit2_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.065689 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035384 | Grad Max: 0.369278 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000518 | Grad Max: 0.012840 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016653 | Grad Max: 0.064023 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001537 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004418 | Grad Max: 0.010564 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000074 | Grad Max: 0.000971 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001928 | Grad Max: 0.005480 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005866 | Grad Max: 0.012857 -> Layer: exit2_layers.12.bias | Grad Mean: 0.075319 | Grad Max: 0.075319 [GRADIENT NORM TOTAL] 4.1927 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.085 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5212006 0.4787994] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 100/1948 | C: 219/1829 [LOSS Ex1] A: 0.00000 | B: 0.68723 | C: 0.68212 [LOGITS Ex2 A] Mean Abs: 0.778 | Max: 3.978 [LOSS Ex2] A: 0.46321 | B: 0.50012 | C: 0.47475 ** [JOINT LOSS] ** : 0.935812 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005955 | Grad Max: 0.167478 -> Layer: shared_layers.0.bias | Grad Mean: 0.257342 | Grad Max: 1.584616 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001145 | Grad Max: 0.005239 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012725 | Grad Max: 0.012725 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.082511 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044214 | Grad Max: 0.442254 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000647 | Grad Max: 0.014175 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020986 | Grad Max: 0.069687 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000149 | Grad Max: 0.001926 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.005568 | Grad Max: 0.013106 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000093 | Grad Max: 0.001206 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002439 | Grad Max: 0.006842 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007318 | Grad Max: 0.014900 -> Layer: exit2_layers.12.bias | Grad Mean: 0.095357 | Grad Max: 0.095357 [GRADIENT NORM TOTAL] 5.2928 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.109 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50868213 0.4913179 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/1616 | B: 101/1947 | C: 242/1806 [LOSS Ex1] A: 0.00000 | B: 0.68745 | C: 0.68345 [LOGITS Ex2 A] Mean Abs: 0.790 | Max: 3.806 [LOSS Ex2] A: 0.43150 | B: 0.46573 | C: 0.45664 ** [JOINT LOSS] ** : 0.908255 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003649 | Grad Max: 0.104599 -> Layer: shared_layers.0.bias | Grad Mean: 0.159580 | Grad Max: 0.997957 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001280 | Grad Max: 0.005046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019372 | Grad Max: 0.019372 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001450 | Grad Max: 0.055166 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027191 | Grad Max: 0.286822 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.009593 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012985 | Grad Max: 0.047538 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003448 | Grad Max: 0.008302 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000740 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001501 | Grad Max: 0.004252 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004548 | Grad Max: 0.010285 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058409 | Grad Max: 0.058409 [GRADIENT NORM TOTAL] 
3.3068 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.211 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5119357 0.48806432] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 91/1957 | C: 227/1821 [LOSS Ex1] A: 0.68502 | B: 0.68585 | C: 0.68210 [LOGITS Ex2 A] Mean Abs: 0.819 | Max: 3.608 [LOSS Ex2] A: 0.42265 | B: 0.44896 | C: 0.45588 ** [JOINT LOSS] ** : 1.126823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002380 | Grad Max: 0.071424 -> Layer: shared_layers.0.bias | Grad Mean: 0.064190 | Grad Max: 0.371426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001946 | Grad Max: 0.010179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012553 | Grad Max: 0.012553 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000617 | Grad Max: 0.031395 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011624 | Grad Max: 0.157595 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.004217 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005244 | Grad Max: 0.019545 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000551 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001374 | Grad Max: 0.003705 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000376 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000602 | Grad Max: 0.001882 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001635 | Grad Max: 0.004727 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021806 | Grad Max: 0.021806 [GRADIENT NORM TOTAL] 1.3540 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50611496 0.4938851 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 1/2047 | B: 98/1758 | C: 216/1832 [LOSS Ex1] A: 0.68502 | B: 
0.68745 | C: 0.68325 [LOGITS Ex2 A] Mean Abs: 0.820 | Max: 3.687 [LOSS Ex2] A: 0.43436 | B: 0.45405 | C: 0.45473 ** [JOINT LOSS] ** : 1.132953 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003532 | Grad Max: 0.097544 -> Layer: shared_layers.0.bias | Grad Mean: 0.126056 | Grad Max: 0.779230 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.010856 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021792 | Grad Max: 0.021792 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.052640 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021886 | Grad Max: 0.263083 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000308 | Grad Max: 0.008051 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009967 | Grad Max: 0.037687 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000920 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002635 | Grad Max: 0.006442 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000593 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001152 | Grad Max: 0.003251 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003373 | Grad Max: 0.007502 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044238 | Grad Max: 0.044238 [GRADIENT NORM TOTAL] 2.5888 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.212 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108243 0.48917568] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 102/1946 | C: 214/1834 [LOSS Ex1] A: 0.68410 | B: 0.68715 | C: 0.68267 [LOGITS Ex2 A] Mean Abs: 0.807 | Max: 3.486 [LOSS Ex2] A: 0.42864 | B: 0.47492 | C: 0.45249 ** [JOINT LOSS] ** : 1.136655 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.065982 -> Layer: shared_layers.0.bias | Grad Mean: 0.039330 | Grad Max: 0.237102 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.010601 
-> Layer: exit1_layers.0.bias | Grad Mean: 0.013264 | Grad Max: 0.013264 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000394 | Grad Max: 0.036099 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007185 | Grad Max: 0.201627 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000099 | Grad Max: 0.003102 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003056 | Grad Max: 0.013530 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000396 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000805 | Grad Max: 0.002479 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000243 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000346 | Grad Max: 0.001325 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000878 | Grad Max: 0.003252 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011850 | Grad Max: 0.011850 [GRADIENT NORM TOTAL] 0.8785 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.189 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122033 0.4877967] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 2/2046 | B: 101/1947 | C: 224/1824 [LOSS Ex1] A: 0.68515 | B: 0.68736 | C: 0.68211 [LOGITS Ex2 A] Mean Abs: 0.772 | Max: 3.837 [LOSS Ex2] A: 0.45123 | B: 0.46160 | C: 0.46935 ** [JOINT LOSS] ** : 1.145597 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004480 | Grad Max: 0.117355 -> Layer: shared_layers.0.bias | Grad Mean: 0.139012 | Grad Max: 0.827945 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.008747 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007271 | Grad Max: 0.007271 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001357 | Grad Max: 0.049116 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025145 | Grad Max: 0.259510 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.008793 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011632 | Grad Max: 0.045624 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001169 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003097 | Grad Max: 0.007766 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000627 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001354 | Grad Max: 0.003759 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004204 | Grad Max: 0.009471 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053764 | Grad Max: 0.053764 [GRADIENT NORM TOTAL] 2.8937 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.141 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013458 0.49865425] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 93/1955 | C: 210/1838 [LOSS Ex1] A: 0.00000 | B: 0.68575 | C: 0.68358 [LOGITS Ex2 A] Mean Abs: 0.758 | Max: 3.855 [LOSS Ex2] A: 0.44594 | B: 0.44928 | C: 0.47515 ** [JOINT LOSS] ** : 0.913236 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004655 | Grad Max: 0.130096 -> Layer: shared_layers.0.bias | Grad Mean: 0.159138 | Grad Max: 0.974874 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.005306 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011483 | Grad Max: 0.011483 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001519 | Grad Max: 0.056309 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028423 | Grad Max: 0.296868 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000405 | Grad Max: 0.010114 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013114 | Grad Max: 0.044648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001202 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003488 | Grad Max: 0.008137 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000058 | Grad Max: 0.000724 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001524 | Grad Max: 0.004058 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004639 | Grad Max: 0.010389 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.060052 | Grad Max: 0.060052 [GRADIENT NORM TOTAL] 3.2975 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.190 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5206863 0.47931367] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 101/1755 | C: 149/1227 [LOSS Ex1] A: 0.68504 | B: 0.68735 | C: 0.68117 [LOGITS Ex2 A] Mean Abs: 0.812 | Max: 3.779 [LOSS Ex2] A: 0.42423 | B: 0.44758 | C: 0.46680 ** [JOINT LOSS] ** : 1.130723 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.063930 -> Layer: shared_layers.0.bias | Grad Mean: 0.045409 | Grad Max: 0.233599 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001939 | Grad Max: 0.010126 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013499 | Grad Max: 0.013499 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000476 | Grad Max: 0.020711 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008549 | Grad Max: 0.107756 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.002940 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003695 | Grad Max: 0.014620 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000394 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000982 | Grad Max: 0.002733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000294 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000431 | Grad Max: 0.001618 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001415 | Grad Max: 0.004352 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017559 | Grad Max: 0.017559 [GRADIENT NORM TOTAL] 0.9970 [EPOCH SUMMARY] Train Loss: 1.0768 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.1278 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 13/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.213 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50866747 0.49133256] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 107/1941 | C: 224/1824 [LOSS Ex1] A: 0.68587 | B: 0.68704 | C: 0.68206 [LOGITS Ex2 A] Mean Abs: 0.851 | Max: 3.649 [LOSS Ex2] A: 0.45136 | B: 0.50492 | C: 0.44847 ** [JOINT LOSS] ** : 1.153240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003661 | Grad Max: 0.101209 -> Layer: shared_layers.0.bias | Grad Mean: 0.198608 | Grad Max: 1.235167 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001828 | Grad Max: 0.009640 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014396 | Grad Max: 0.014396 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001701 | Grad Max: 0.069071 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033025 | Grad Max: 0.401876 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.011006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015990 | Grad Max: 0.056619 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004210 | Grad Max: 0.010112 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000067 | Grad Max: 0.000889 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001829 | Grad Max: 0.004942 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005259 | Grad Max: 0.012141 -> Layer: exit2_layers.12.bias | Grad Mean: 0.069890 | Grad Max: 0.069890 [GRADIENT NORM TOTAL] 4.1065 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52209735 0.47790262] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.005 [MASKS] A(Pass/Fail): 0/2048 | B: 105/1943 | C: 215/1833 [LOSS Ex1] A: 0.00000 | B: 0.68726 | C: 0.68221 [LOGITS Ex2 A] Mean Abs: 0.902 | Max: 3.745 [LOSS Ex2] A: 0.45396 | B: 0.49609 | C: 0.47111 ** [JOINT LOSS] ** : 0.930208 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005543 | Grad Max: 0.151161 -> Layer: shared_layers.0.bias | Grad Mean: 0.250835 | Grad Max: 1.557162 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001049 | Grad Max: 0.005431 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007039 | Grad Max: 0.007039 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.093746 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042921 | Grad Max: 0.492413 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000620 | Grad Max: 0.013350 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020214 | Grad Max: 0.070463 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000140 | Grad Max: 0.001694 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005340 | Grad Max: 0.012424 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000086 | Grad Max: 0.001099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002322 | Grad Max: 0.006453 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006738 | Grad Max: 0.016167 -> Layer: exit2_layers.12.bias | Grad Mean: 0.088769 | Grad Max: 0.088769 [GRADIENT NORM TOTAL] 5.2238 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.110 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50935894 0.49064106] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 100/1948 | C: 223/1825 [LOSS Ex1] A: 0.00000 | B: 0.68565 | C: 0.68385 [LOGITS Ex2 A] Mean Abs: 0.903 | Max: 3.516 [LOSS Ex2] A: 0.42420 | B: 0.47981 | C: 0.46850 ** [JOINT LOSS] ** : 0.914000 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004683 | Grad Max: 0.123911 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.219863 | Grad Max: 1.337774 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.005949 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022268 | Grad Max: 0.022268 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001935 | Grad Max: 0.078084 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037036 | Grad Max: 0.441926 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.010998 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017530 | Grad Max: 0.056661 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001537 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004610 | Grad Max: 0.010907 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000074 | Grad Max: 0.000936 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002002 | Grad Max: 0.005594 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005767 | Grad Max: 0.013277 -> Layer: exit2_layers.12.bias | Grad Mean: 0.076761 | Grad Max: 0.076761 [GRADIENT NORM TOTAL] 4.5088 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.511722 0.488278] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 105/1751 | C: 200/1848 [LOSS Ex1] A: 0.68482 | B: 0.68726 | C: 0.68344 [LOGITS Ex2 A] Mean Abs: 0.820 | Max: 3.692 [LOSS Ex2] A: 0.42372 | B: 0.43989 | C: 0.47711 ** [JOINT LOSS] ** : 1.132078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001230 | Grad Max: 0.018586 -> Layer: shared_layers.0.bias | Grad Mean: 0.036908 | Grad Max: 0.216045 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.010088 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017026 | Grad Max: 0.017026 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000342 | Grad Max: 0.013227 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006210 | Grad Max: 0.072556 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.002868 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002752 | Grad Max: 0.013271 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000338 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.002348 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000230 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000311 | Grad Max: 0.001197 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000804 | Grad Max: 0.002958 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011242 | Grad Max: 0.011242 [GRADIENT NORM TOTAL] 0.7704 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.210 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50578946 0.49421057] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 112/1936 | C: 220/1828 [LOSS Ex1] A: 0.68479 | B: 0.68695 | C: 0.68366 [LOGITS Ex2 A] Mean Abs: 0.813 | Max: 3.877 [LOSS Ex2] A: 0.44784 | B: 0.48633 | C: 0.48231 ** [JOINT LOSS] ** : 1.157295 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004861 | Grad Max: 0.139294 -> Layer: shared_layers.0.bias | Grad Mean: 0.221011 | Grad Max: 1.358294 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.011085 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022671 | Grad Max: 0.022671 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.074608 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037663 | Grad Max: 0.405886 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000540 | Grad Max: 0.014316 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017624 | Grad Max: 0.066064 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001574 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004654 | Grad Max: 0.010870 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000075 | Grad Max: 0.000988 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002037 | Grad Max: 0.005871 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005996 | Grad Max: 0.012430 -> Layer: exit2_layers.12.bias | Grad Mean: 0.079544 | Grad Max: 0.079544 [GRADIENT NORM TOTAL] 4.5278 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.215 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105478 0.4894522] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 113/1935 | C: 218/1830 [LOSS Ex1] A: 0.68388 | B: 0.68716 | C: 0.68358 [LOGITS Ex2 A] Mean Abs: 0.822 | Max: 4.176 [LOSS Ex2] A: 0.46598 | B: 0.50427 | C: 0.48672 ** [JOINT LOSS] ** : 1.170530 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006863 | Grad Max: 0.190868 -> Layer: shared_layers.0.bias | Grad Mean: 0.313434 | Grad Max: 1.939338 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.010041 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014809 | Grad Max: 0.014809 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002782 | Grad Max: 0.106735 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053059 | Grad Max: 0.559461 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000767 | Grad Max: 0.018961 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025138 | Grad Max: 0.089918 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000172 | Grad Max: 0.002224 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006626 | Grad Max: 0.015590 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000106 | Grad Max: 0.001315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002903 | Grad Max: 0.008089 -> Layer: exit2_layers.12.weight | Grad Mean: 0.008459 | Grad Max: 0.017960 -> Layer: exit2_layers.12.bias | Grad Mean: 0.114752 | Grad Max: 0.114752 [GRADIENT NORM TOTAL] 6.4081 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 
0.027 | Max: 0.192 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5119585 0.48804152] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 2/2046 | B: 102/1946 | C: 227/1821 [LOSS Ex1] A: 0.68499 | B: 0.68554 | C: 0.68381 [LOGITS Ex2 A] Mean Abs: 0.779 | Max: 3.875 [LOSS Ex2] A: 0.46735 | B: 0.47923 | C: 0.46638 ** [JOINT LOSS] ** : 1.155766 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006310 | Grad Max: 0.181320 -> Layer: shared_layers.0.bias | Grad Mean: 0.269654 | Grad Max: 1.636061 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.009389 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014638 | Grad Max: 0.014638 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002437 | Grad Max: 0.086809 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046006 | Grad Max: 0.487098 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000662 | Grad Max: 0.015199 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021613 | Grad Max: 0.078028 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000149 | Grad Max: 0.001827 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005698 | Grad Max: 0.013202 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000092 | Grad Max: 0.001157 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002488 | Grad Max: 0.006898 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007320 | Grad Max: 0.014920 -> Layer: exit2_layers.12.bias | Grad Mean: 0.097530 | Grad Max: 0.097530 [GRADIENT NORM TOTAL] 5.5030 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.026 | Max: 0.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018524 0.49814767] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 107/1749 | C: 217/1831 [LOSS Ex1] A: 0.00000 | B: 0.68716 | C: 0.68164 [LOGITS Ex2 A] Mean Abs: 0.737 | Max: 3.724 [LOSS Ex2] A: 0.42436 | B: 0.44585 | C: 0.45967 ** [JOINT LOSS] ** : 0.899563 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002730 | Grad Max: 0.071032 -> Layer: shared_layers.0.bias | Grad Mean: 0.130315 | Grad Max: 0.816033 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001160 | Grad Max: 0.005277 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010099 | Grad Max: 0.010099 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.049153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021901 | Grad Max: 0.263830 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.007655 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010524 | Grad Max: 0.037565 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000930 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002772 | Grad Max: 0.006862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000556 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001201 | Grad Max: 0.003410 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003558 | Grad Max: 0.008716 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046569 | Grad Max: 0.046569 [GRADIENT NORM TOTAL] 2.6963 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.193 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5216402 0.47835988] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 114/1934 | C: 227/1821 [LOSS Ex1] A: 0.68484 | B: 0.68685 | C: 0.68207 [LOGITS Ex2 A] Mean Abs: 0.836 | Max: 3.727 [LOSS Ex2] A: 0.43711 | B: 0.48182 | C: 0.46025 ** [JOINT LOSS] ** : 1.144317 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005706 | Grad Max: 0.180257 -> Layer: shared_layers.0.bias | Grad Mean: 0.161535 | Grad Max: 0.935794 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.010117 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013690 | Grad Max: 0.013690 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001584 | Grad Max: 
0.059072 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029362 | Grad Max: 0.296301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.009199 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013141 | Grad Max: 0.049463 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001246 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003478 | Grad Max: 0.008547 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000768 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001512 | Grad Max: 0.004492 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004392 | Grad Max: 0.009275 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057471 | Grad Max: 0.057471 [GRADIENT NORM TOTAL] 3.3693 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.216 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083936 0.49160644] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 115/1933 | C: 231/1817 [LOSS Ex1] A: 0.68572 | B: 0.68708 | C: 0.68073 [LOGITS Ex2 A] Mean Abs: 0.888 | Max: 3.399 [LOSS Ex2] A: 0.46682 | B: 0.48727 | C: 0.46756 ** [JOINT LOSS] ** : 1.158392 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006869 | Grad Max: 0.206397 -> Layer: shared_layers.0.bias | Grad Mean: 0.283724 | Grad Max: 1.706938 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001821 | Grad Max: 0.009194 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009682 | Grad Max: 0.009682 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002570 | Grad Max: 0.098984 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049026 | Grad Max: 0.559611 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.014997 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022494 | Grad Max: 0.078575 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000154 | Grad Max: 0.001976 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005924 | Grad Max: 0.014101 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000095 | Grad Max: 0.001183 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002576 | Grad Max: 0.006993 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007432 | Grad Max: 0.017181 -> Layer: exit2_layers.12.bias | Grad Mean: 0.098482 | Grad Max: 0.098482 [GRADIENT NORM TOTAL] 5.8148 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5229239 0.4770761] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 109/1939 | C: 220/1828 [LOSS Ex1] A: 0.00000 | B: 0.68545 | C: 0.68153 [LOGITS Ex2 A] Mean Abs: 0.889 | Max: 3.766 [LOSS Ex2] A: 0.45225 | B: 0.47664 | C: 0.44827 ** [JOINT LOSS] ** : 0.914716 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006391 | Grad Max: 0.172901 -> Layer: shared_layers.0.bias | Grad Mean: 0.243179 | Grad Max: 1.462826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001145 | Grad Max: 0.005549 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003044 | Grad Max: 0.003044 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.085078 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041697 | Grad Max: 0.466138 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000579 | Grad Max: 0.012664 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018867 | Grad Max: 0.064823 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001694 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004989 | Grad Max: 0.012402 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000079 | Grad Max: 0.000982 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002171 | Grad Max: 0.005962 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006256 | Grad Max: 0.014290 -> Layer: exit2_layers.12.bias | Grad Mean: 0.082915 | Grad Max: 0.082915 [GRADIENT NORM TOTAL] 4.9423 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) 
| Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.110 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5099776 0.4900224] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 110/1746 | C: 219/1829 [LOSS Ex1] A: 0.00000 | B: 0.68708 | C: 0.68260 [LOGITS Ex2 A] Mean Abs: 0.852 | Max: 3.680 [LOSS Ex2] A: 0.41525 | B: 0.43874 | C: 0.43307 ** [JOINT LOSS] ** : 0.885577 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003399 | Grad Max: 0.088965 -> Layer: shared_layers.0.bias | Grad Mean: 0.104238 | Grad Max: 0.633965 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001122 | Grad Max: 0.005241 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013392 | Grad Max: 0.013392 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000975 | Grad Max: 0.042724 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018482 | Grad Max: 0.219247 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000259 | Grad Max: 0.006552 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008410 | Grad Max: 0.031115 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000764 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002213 | Grad Max: 0.005451 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000545 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.003070 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002588 | Grad Max: 0.006325 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036158 | Grad Max: 0.036158 [GRADIENT NORM TOTAL] 2.1506 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.216 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51150197 0.48849797] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 117/1931 | C: 214/1834 [LOSS Ex1] A: 0.68466 | B: 0.68677 | C: 0.68260 [LOGITS Ex2 A] Mean Abs: 0.803 | Max: 3.830 [LOSS 
Ex2] A: 0.42544 | B: 0.47536 | C: 0.46184 ** [JOINT LOSS] ** : 1.138886 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003784 | Grad Max: 0.095331 -> Layer: shared_layers.0.bias | Grad Mean: 0.149209 | Grad Max: 0.897146 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001878 | Grad Max: 0.009959 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014470 | Grad Max: 0.014470 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001378 | Grad Max: 0.055358 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025743 | Grad Max: 0.306408 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.008053 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011958 | Grad Max: 0.039609 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.001108 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003163 | Grad Max: 0.007804 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000695 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001384 | Grad Max: 0.003995 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004096 | Grad Max: 0.009501 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054172 | Grad Max: 0.054172 [GRADIENT NORM TOTAL] 3.0632 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.212 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50549144 0.49450853] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 118/1930 | C: 144/1232 [LOSS Ex1] A: 0.68462 | B: 0.68700 | C: 0.68231 [LOGITS Ex2 A] Mean Abs: 0.804 | Max: 3.758 [LOSS Ex2] A: 0.44342 | B: 0.47390 | C: 0.46535 ** [JOINT LOSS] ** : 1.145531 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005507 | Grad Max: 0.152996 -> Layer: shared_layers.0.bias | Grad Mean: 0.247635 | Grad Max: 1.501233 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.010114 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015511 | Grad Max: 
0.015511 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.084958 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041942 | Grad Max: 0.477203 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000599 | Grad Max: 0.014287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019690 | Grad Max: 0.064640 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000133 | Grad Max: 0.001805 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005199 | Grad Max: 0.012320 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000081 | Grad Max: 0.001054 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002272 | Grad Max: 0.006416 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006443 | Grad Max: 0.013997 -> Layer: exit2_layers.12.bias | Grad Mean: 0.088202 | Grad Max: 0.088202 [GRADIENT NORM TOTAL] 5.0311 [EPOCH SUMMARY] Train Loss: 1.0643 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1181 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 14/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51031023 0.48968974] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 110/1938 | C: 205/1843 [LOSS Ex1] A: 0.68372 | B: 0.68537 | C: 0.68327 [LOGITS Ex2 A] Mean Abs: 0.796 | Max: 3.507 [LOSS Ex2] A: 0.43886 | B: 0.44437 | C: 0.46836 ** [JOINT LOSS] ** : 1.134649 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004506 | Grad Max: 0.109250 -> Layer: shared_layers.0.bias | Grad Mean: 0.208138 | Grad Max: 1.270838 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.010681 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014983 | Grad Max: 0.014983 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001832 | Grad Max: 0.069294 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.034732 | Grad Max: 0.383120 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.011467 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016395 | Grad Max: 0.056857 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000110 | Grad Max: 0.001470 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004323 | Grad Max: 0.010442 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000068 | Grad Max: 0.000901 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001887 | Grad Max: 0.005301 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005436 | Grad Max: 0.011317 -> Layer: exit2_layers.12.bias | Grad Mean: 0.073409 | Grad Max: 0.073409 [GRADIENT NORM TOTAL] 4.2067 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.194 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117516 0.4882484] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 2/2046 | B: 113/1743 | C: 218/1830 [LOSS Ex1] A: 0.68488 | B: 0.68700 | C: 0.68421 [LOGITS Ex2 A] Mean Abs: 0.794 | Max: 3.673 [LOSS Ex2] A: 0.42349 | B: 0.44672 | C: 0.44593 ** [JOINT LOSS] ** : 1.124075 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.060297 -> Layer: shared_layers.0.bias | Grad Mean: 0.063031 | Grad Max: 0.359091 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001791 | Grad Max: 0.009413 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015835 | Grad Max: 0.015835 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000609 | Grad Max: 0.023092 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011034 | Grad Max: 0.126433 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.003934 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005107 | Grad Max: 0.017536 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000544 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001361 | Grad Max: 0.003800 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad 
Max: 0.000328 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000587 | Grad Max: 0.001895 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001792 | Grad Max: 0.005263 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022736 | Grad Max: 0.022736 [GRADIENT NORM TOTAL] 1.3083 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022183 0.4977817] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 122/1926 | C: 216/1832 [LOSS Ex1] A: 0.00000 | B: 0.68669 | C: 0.68220 [LOGITS Ex2 A] Mean Abs: 0.846 | Max: 3.569 [LOSS Ex2] A: 0.43383 | B: 0.48889 | C: 0.45948 ** [JOINT LOSS] ** : 0.917033 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005025 | Grad Max: 0.134538 -> Layer: shared_layers.0.bias | Grad Mean: 0.201530 | Grad Max: 1.220197 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001066 | Grad Max: 0.005151 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008190 | Grad Max: 0.008190 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.070422 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033742 | Grad Max: 0.383758 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.010581 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015325 | Grad Max: 0.053599 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004048 | Grad Max: 0.010061 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000064 | Grad Max: 0.000849 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001763 | Grad Max: 0.004997 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004978 | Grad Max: 0.011714 -> Layer: exit2_layers.12.bias | Grad Mean: 0.066946 | Grad Max: 0.066946 [GRADIENT NORM TOTAL] 4.0725 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 
[LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52235824 0.47764176] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 122/1926 | C: 192/1856 [LOSS Ex1] A: 0.68469 | B: 0.68692 | C: 0.68382 [LOGITS Ex2 A] Mean Abs: 0.931 | Max: 3.794 [LOSS Ex2] A: 0.46877 | B: 0.51097 | C: 0.46460 ** [JOINT LOSS] ** : 1.166592 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007681 | Grad Max: 0.227402 -> Layer: shared_layers.0.bias | Grad Mean: 0.341362 | Grad Max: 2.062739 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.010347 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019807 | Grad Max: 0.019807 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.112736 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056895 | Grad Max: 0.628586 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000798 | Grad Max: 0.017852 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026126 | Grad Max: 0.089403 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000175 | Grad Max: 0.002205 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006883 | Grad Max: 0.016716 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000108 | Grad Max: 0.001408 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003002 | Grad Max: 0.008336 -> Layer: exit2_layers.12.weight | Grad Mean: 0.008699 | Grad Max: 0.020364 -> Layer: exit2_layers.12.bias | Grad Mean: 0.115615 | Grad Max: 0.115615 [GRADIENT NORM TOTAL] 6.8893 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.218 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081861 0.49181384] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 114/1934 | C: 229/1819 [LOSS Ex1] A: 0.68560 | B: 0.68529 | C: 0.67987 [LOGITS Ex2 A] Mean Abs: 0.918 | Max: 3.730 [LOSS Ex2] A: 0.46799 | B: 0.49862 | C: 0.46360 ** [JOINT 
LOSS] ** : 1.160321 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006540 | Grad Max: 0.185130 -> Layer: shared_layers.0.bias | Grad Mean: 0.322500 | Grad Max: 1.957633 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.009015 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004172 | Grad Max: 0.004172 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002775 | Grad Max: 0.105910 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053222 | Grad Max: 0.600234 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000748 | Grad Max: 0.016179 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024661 | Grad Max: 0.081699 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000164 | Grad Max: 0.002151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006491 | Grad Max: 0.015790 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000101 | Grad Max: 0.001252 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002831 | Grad Max: 0.007824 -> Layer: exit2_layers.12.weight | Grad Mean: 0.008014 | Grad Max: 0.019399 -> Layer: exit2_layers.12.bias | Grad Mean: 0.107948 | Grad Max: 0.107948 [GRADIENT NORM TOTAL] 6.4957 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52354157 0.4764584 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 118/1738 | C: 213/1835 [LOSS Ex1] A: 0.00000 | B: 0.68692 | C: 0.68292 [LOGITS Ex2 A] Mean Abs: 0.862 | Max: 3.594 [LOSS Ex2] A: 0.42852 | B: 0.46403 | C: 0.45134 ** [JOINT LOSS] ** : 0.904575 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004236 | Grad Max: 0.110307 -> Layer: shared_layers.0.bias | Grad Mean: 0.198989 | Grad Max: 1.197965 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001208 | Grad Max: 0.004975 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016457 | Grad Max: 0.016457 -> Layer: exit2_layers.0.weight | Grad Mean: 
0.001691 | Grad Max: 0.065662 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032252 | Grad Max: 0.371374 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.010010 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015066 | Grad Max: 0.049603 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001386 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003965 | Grad Max: 0.009854 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000807 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001737 | Grad Max: 0.004829 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004929 | Grad Max: 0.010441 -> Layer: exit2_layers.12.bias | Grad Mean: 0.067084 | Grad Max: 0.067084 [GRADIENT NORM TOTAL] 3.9659 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.111 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51045907 0.4895409 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 123/1925 | C: 226/1822 [LOSS Ex1] A: 0.00000 | B: 0.68662 | C: 0.68302 [LOGITS Ex2 A] Mean Abs: 0.822 | Max: 3.743 [LOSS Ex2] A: 0.41318 | B: 0.46577 | C: 0.44520 ** [JOINT LOSS] ** : 0.897930 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002345 | Grad Max: 0.067573 -> Layer: shared_layers.0.bias | Grad Mean: 0.073075 | Grad Max: 0.442686 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.005136 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014915 | Grad Max: 0.014915 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000726 | Grad Max: 0.031449 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013173 | Grad Max: 0.170650 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000180 | Grad Max: 0.004033 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005834 | Grad Max: 0.020782 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000627 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001550 | Grad 
Max: 0.004251 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.002024 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002065 | Grad Max: 0.005202 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026725 | Grad Max: 0.026725 [GRADIENT NORM TOTAL] 1.5344 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.219 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113187 0.48868135] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 123/1925 | C: 223/1825 [LOSS Ex1] A: 0.68453 | B: 0.68685 | C: 0.68284 [LOGITS Ex2 A] Mean Abs: 0.818 | Max: 3.917 [LOSS Ex2] A: 0.41762 | B: 0.45860 | C: 0.45875 ** [JOINT LOSS] ** : 1.129728 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003966 | Grad Max: 0.104923 -> Layer: shared_layers.0.bias | Grad Mean: 0.141135 | Grad Max: 0.845677 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.009900 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015326 | Grad Max: 0.015326 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001322 | Grad Max: 0.052380 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024817 | Grad Max: 0.289540 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.007901 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011285 | Grad Max: 0.039932 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001015 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002982 | Grad Max: 0.007221 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000635 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001302 | Grad Max: 0.003648 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003811 | Grad Max: 0.009144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050725 | Grad Max: 0.050725 [GRADIENT NORM TOTAL] 2.8945 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505235 0.49476495] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 115/1933 | C: 237/1811 [LOSS Ex1] A: 0.68448 | B: 0.68521 | C: 0.68193 [LOGITS Ex2 A] Mean Abs: 0.807 | Max: 3.809 [LOSS Ex2] A: 0.41943 | B: 0.45252 | C: 0.45841 ** [JOINT LOSS] ** : 1.127328 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003530 | Grad Max: 0.091850 -> Layer: shared_layers.0.bias | Grad Mean: 0.123736 | Grad Max: 0.748238 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001999 | Grad Max: 0.010457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014806 | Grad Max: 0.014806 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001175 | Grad Max: 0.046133 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021650 | Grad Max: 0.260669 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.006507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009874 | Grad Max: 0.030570 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000896 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002624 | Grad Max: 0.006472 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000532 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.003164 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003411 | Grad Max: 0.007891 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045300 | Grad Max: 0.045300 [GRADIENT NORM TOTAL] 2.5480 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.219 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51009023 0.4899098 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 119/1737 | C: 223/1825 [LOSS Ex1] A: 0.68359 | B: 0.68684 | C: 0.68178 [LOGITS Ex2 A] Mean Abs: 0.818 
| Max: 3.768 [LOSS Ex2] A: 0.42231 | B: 0.44768 | C: 0.44023 ** [JOINT LOSS] ** : 1.120811 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.050477 -> Layer: shared_layers.0.bias | Grad Mean: 0.030111 | Grad Max: 0.153235 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001911 | Grad Max: 0.009961 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008733 | Grad Max: 0.008733 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000337 | Grad Max: 0.013506 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005878 | Grad Max: 0.075346 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003644 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002440 | Grad Max: 0.013734 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000644 | Grad Max: 0.001994 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000223 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000286 | Grad Max: 0.001096 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000839 | Grad Max: 0.003144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010603 | Grad Max: 0.010603 [GRADIENT NORM TOTAL] 0.6822 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51154274 0.48845732] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 2/2046 | B: 123/1925 | C: 235/1813 [LOSS Ex1] A: 0.68479 | B: 0.68654 | C: 0.68167 [LOGITS Ex2 A] Mean Abs: 0.823 | Max: 3.897 [LOSS Ex2] A: 0.41593 | B: 0.47618 | C: 0.45664 ** [JOINT LOSS] ** : 1.133914 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002424 | Grad Max: 0.050194 -> Layer: shared_layers.0.bias | Grad Mean: 0.107231 | Grad Max: 0.634212 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001795 | Grad Max: 0.009074 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009501 
| Grad Max: 0.009501 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000925 | Grad Max: 0.040238 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017622 | Grad Max: 0.230344 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.005977 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008172 | Grad Max: 0.029281 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000703 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002145 | Grad Max: 0.005244 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000501 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000936 | Grad Max: 0.002795 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002416 | Grad Max: 0.005485 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034603 | Grad Max: 0.034603 [GRADIENT NORM TOTAL] 2.1553 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.502561 0.49743906] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 124/1924 | C: 223/1825 [LOSS Ex1] A: 0.00000 | B: 0.68677 | C: 0.68199 [LOGITS Ex2 A] Mean Abs: 0.789 | Max: 3.807 [LOSS Ex2] A: 0.41852 | B: 0.45629 | C: 0.45369 ** [JOINT LOSS] ** : 0.899084 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001665 | Grad Max: 0.038104 -> Layer: shared_layers.0.bias | Grad Mean: 0.044771 | Grad Max: 0.234484 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005166 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012895 | Grad Max: 0.012895 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000416 | Grad Max: 0.017086 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007818 | Grad Max: 0.097869 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.003127 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003406 | Grad Max: 0.012596 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 
0.000355 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000891 | Grad Max: 0.002350 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000233 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001285 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001017 | Grad Max: 0.003526 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014603 | Grad Max: 0.014603 [GRADIENT NORM TOTAL] 0.9211 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.197 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.523057 0.476943] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 117/1931 | C: 223/1825 [LOSS Ex1] A: 0.68454 | B: 0.68511 | C: 0.68119 [LOGITS Ex2 A] Mean Abs: 0.811 | Max: 4.026 [LOSS Ex2] A: 0.41130 | B: 0.44828 | C: 0.44990 ** [JOINT LOSS] ** : 1.120104 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003477 | Grad Max: 0.088827 -> Layer: shared_layers.0.bias | Grad Mean: 0.117063 | Grad Max: 0.706028 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002048 | Grad Max: 0.010576 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016367 | Grad Max: 0.016367 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.043646 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020441 | Grad Max: 0.228140 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.007659 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009236 | Grad Max: 0.038722 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000804 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002444 | Grad Max: 0.006053 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000575 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001065 | Grad Max: 0.003236 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003230 | Grad Max: 0.008097 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041788 | Grad Max: 0.041788 [GRADIENT 
NORM TOTAL] 2.4028 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.221 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079726 0.49202734] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 3/2045 | B: 123/1733 | C: 161/1215 [LOSS Ex1] A: 0.68549 | B: 0.68675 | C: 0.68155 [LOGITS Ex2 A] Mean Abs: 0.805 | Max: 3.687 [LOSS Ex2] A: 0.42106 | B: 0.44541 | C: 0.42835 ** [JOINT LOSS] ** : 1.116197 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004365 | Grad Max: 0.139549 -> Layer: shared_layers.0.bias | Grad Mean: 0.139533 | Grad Max: 0.829994 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001797 | Grad Max: 0.009246 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013727 | Grad Max: 0.013727 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.056489 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025738 | Grad Max: 0.298001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.008807 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011673 | Grad Max: 0.041274 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001108 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003090 | Grad Max: 0.007734 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000634 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001337 | Grad Max: 0.003851 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003874 | Grad Max: 0.008774 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050594 | Grad Max: 0.050594 [GRADIENT NORM TOTAL] 2.9847 [EPOCH SUMMARY] Train Loss: 1.0680 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0980 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.1089 -> New: 1.0980) ############################## EPOCH 15/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.524183 0.475817] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 126/1922 | C: 229/1819 [LOSS Ex1] A: 0.00000 | B: 0.68643 | C: 0.68219 [LOGITS Ex2 A] Mean Abs: 0.813 | Max: 3.629 [LOSS Ex2] A: 0.41991 | B: 0.46469 | C: 0.45796 ** [JOINT LOSS] ** : 0.903726 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001875 | Grad Max: 0.054406 -> Layer: shared_layers.0.bias | Grad Mean: 0.045119 | Grad Max: 0.246264 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001127 | Grad Max: 0.005098 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011149 | Grad Max: 0.011149 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000467 | Grad Max: 0.023562 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008417 | Grad Max: 0.119230 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.002886 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003816 | Grad Max: 0.012512 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000429 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001025 | Grad Max: 0.002976 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000448 | Grad Max: 0.001612 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001381 | Grad Max: 0.003543 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018061 | Grad Max: 0.018061 [GRADIENT NORM TOTAL] 0.9825 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.112 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51095027 0.48904976] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 
0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 126/1922 | C: 232/1816 [LOSS Ex1] A: 0.00000 | B: 0.68666 | C: 0.68137 [LOGITS Ex2 A] Mean Abs: 0.901 | Max: 3.543 [LOSS Ex2] A: 0.40658 | B: 0.47687 | C: 0.44158 ** [JOINT LOSS] ** : 0.897685 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003656 | Grad Max: 0.095113 -> Layer: shared_layers.0.bias | Grad Mean: 0.165471 | Grad Max: 0.983206 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001102 | Grad Max: 0.005238 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010056 | Grad Max: 0.010056 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.057887 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026895 | Grad Max: 0.331551 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000368 | Grad Max: 0.007726 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012188 | Grad Max: 0.040090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.001004 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003185 | Grad Max: 0.007559 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000687 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001379 | Grad Max: 0.004206 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003768 | Grad Max: 0.007710 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051996 | Grad Max: 0.051996 [GRADIENT NORM TOTAL] 3.3040 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.222 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.511127 0.488873] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 122/1926 | C: 212/1836 [LOSS Ex1] A: 0.68439 | B: 0.68499 | C: 0.68132 [LOGITS Ex2 A] Mean Abs: 0.921 | Max: 3.644 [LOSS Ex2] A: 0.43437 | B: 0.47240 | C: 0.46133 ** [JOINT LOSS] ** : 1.139600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006272 | Grad Max: 0.162702 -> Layer: shared_layers.0.bias | Grad Mean: 
0.269086 | Grad Max: 1.610377 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.009855 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011943 | Grad Max: 0.011943 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002384 | Grad Max: 0.094208 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045219 | Grad Max: 0.541097 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000622 | Grad Max: 0.014403 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020522 | Grad Max: 0.070652 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000134 | Grad Max: 0.001768 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005358 | Grad Max: 0.012863 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000082 | Grad Max: 0.001058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002317 | Grad Max: 0.006352 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006632 | Grad Max: 0.014726 -> Layer: exit2_layers.12.bias | Grad Mean: 0.088396 | Grad Max: 0.088396 [GRADIENT NORM TOTAL] 5.4526 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049488 0.4950512] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 1/2047 | B: 126/1730 | C: 227/1821 [LOSS Ex1] A: 0.68433 | B: 0.68664 | C: 0.68041 [LOGITS Ex2 A] Mean Abs: 0.886 | Max: 3.672 [LOSS Ex2] A: 0.43255 | B: 0.45696 | C: 0.43904 ** [JOINT LOSS] ** : 1.126643 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004443 | Grad Max: 0.121797 -> Layer: shared_layers.0.bias | Grad Mean: 0.182630 | Grad Max: 1.100712 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.009914 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009528 | Grad Max: 0.009528 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001614 | Grad Max: 0.066436 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030669 | Grad Max: 0.365904 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad 
Max: 0.009445 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013959 | Grad Max: 0.049194 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001119 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003644 | Grad Max: 0.008869 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000754 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001574 | Grad Max: 0.004381 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004356 | Grad Max: 0.009856 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059141 | Grad Max: 0.059141 [GRADIENT NORM TOTAL] 3.6998 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.222 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5098835 0.4901165] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 4/2044 | B: 127/1921 | C: 197/1851 [LOSS Ex1] A: 0.68344 | B: 0.68633 | C: 0.68492 [LOGITS Ex2 A] Mean Abs: 0.833 | Max: 3.726 [LOSS Ex2] A: 0.41380 | B: 0.46789 | C: 0.44870 ** [JOINT LOSS] ** : 1.128356 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.068573 -> Layer: shared_layers.0.bias | Grad Mean: 0.025011 | Grad Max: 0.111235 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.010936 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021189 | Grad Max: 0.021189 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000328 | Grad Max: 0.035112 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005351 | Grad Max: 0.194873 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.002742 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001850 | Grad Max: 0.012712 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000287 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000471 | Grad Max: 0.001541 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000188 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000205 | Grad Max: 0.000865 -> 
Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.002535 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007246 | Grad Max: 0.007246 [GRADIENT NORM TOTAL] 0.6686 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.198 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51133305 0.48866695] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 2/2046 | B: 129/1919 | C: 217/1831 [LOSS Ex1] A: 0.68468 | B: 0.68656 | C: 0.68239 [LOGITS Ex2 A] Mean Abs: 0.810 | Max: 3.886 [LOSS Ex2] A: 0.43080 | B: 0.45952 | C: 0.45868 ** [JOINT LOSS] ** : 1.134207 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004942 | Grad Max: 0.145242 -> Layer: shared_layers.0.bias | Grad Mean: 0.212113 | Grad Max: 1.295845 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001687 | Grad Max: 0.008271 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003929 | Grad Max: 0.003929 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001908 | Grad Max: 0.088786 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036259 | Grad Max: 0.484962 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000506 | Grad Max: 0.010945 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016781 | Grad Max: 0.059343 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001501 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004379 | Grad Max: 0.010371 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000066 | Grad Max: 0.000866 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001888 | Grad Max: 0.005128 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005430 | Grad Max: 0.011634 -> Layer: exit2_layers.12.bias | Grad Mean: 0.072743 | Grad Max: 0.072743 [GRADIENT NORM TOTAL] 4.3527 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.027 | Max: 0.143 [SAMPLE 0 PREDICTION A] Top2 Probs: 
[0.5029498 0.49705023] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.510 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 123/1925 | C: 243/1805 [LOSS Ex1] A: 0.00000 | B: 0.68488 | C: 0.67945 [LOGITS Ex2 A] Mean Abs: 0.790 | Max: 4.007 [LOSS Ex2] A: 0.45712 | B: 0.45712 | C: 0.46716 ** [JOINT LOSS] ** : 0.915241 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006780 | Grad Max: 0.183198 -> Layer: shared_layers.0.bias | Grad Mean: 0.272646 | Grad Max: 1.637387 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001276 | Grad Max: 0.005693 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007305 | Grad Max: 0.007305 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002493 | Grad Max: 0.092812 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047029 | Grad Max: 0.536257 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000655 | Grad Max: 0.014189 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021664 | Grad Max: 0.073138 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000141 | Grad Max: 0.001813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005646 | Grad Max: 0.013761 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000086 | Grad Max: 0.001064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002435 | Grad Max: 0.006552 -> Layer: exit2_layers.12.weight | Grad Mean: 0.007032 | Grad Max: 0.014172 -> Layer: exit2_layers.12.bias | Grad Mean: 0.093909 | Grad Max: 0.093909 [GRADIENT NORM TOTAL] 5.5698 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5238307 0.47616932] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 129/1727 | C: 227/1821 [LOSS Ex1] A: 0.68438 | B: 0.68653 | C: 0.68146 [LOGITS Ex2 A] Mean Abs: 0.837 | Max: 3.863 [LOSS Ex2] A: 0.41381 | B: 0.45280 | C: 0.46660 ** [JOINT LOSS] ** : 1.128529 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 
0.004911 | Grad Max: 0.147203 -> Layer: shared_layers.0.bias | Grad Mean: 0.197985 | Grad Max: 1.166105 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.010298 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016511 | Grad Max: 0.016511 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.076258 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034339 | Grad Max: 0.444724 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.010382 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015851 | Grad Max: 0.053935 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001305 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004143 | Grad Max: 0.009849 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000063 | Grad Max: 0.000839 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001786 | Grad Max: 0.004959 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005176 | Grad Max: 0.011118 -> Layer: exit2_layers.12.bias | Grad Mean: 0.069346 | Grad Max: 0.069346 [GRADIENT NORM TOTAL] 4.0759 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.224 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077299 0.49227008] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 127/1921 | C: 226/1822 [LOSS Ex1] A: 0.68536 | B: 0.68622 | C: 0.68203 [LOGITS Ex2 A] Mean Abs: 0.831 | Max: 3.772 [LOSS Ex2] A: 0.40546 | B: 0.46067 | C: 0.42191 ** [JOINT LOSS] ** : 1.113883 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.052642 -> Layer: shared_layers.0.bias | Grad Mean: 0.019749 | Grad Max: 0.068982 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001856 | Grad Max: 0.009606 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015081 | Grad Max: 0.015081 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000281 | Grad Max: 0.020232 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004055 | Grad 
Max: 0.101093 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.001747 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001265 | Grad Max: 0.008387 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000288 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000344 | Grad Max: 0.001773 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000139 | Grad Max: 0.000694 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000886 | Grad Max: 0.002874 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005118 | Grad Max: 0.005118 [GRADIENT NORM TOTAL] 0.5379 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.524884 0.47511595] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/2048 | B: 131/1917 | C: 236/1812 [LOSS Ex1] A: 0.00000 | B: 0.68645 | C: 0.68282 [LOGITS Ex2 A] Mean Abs: 0.913 | Max: 3.748 [LOSS Ex2] A: 0.43396 | B: 0.47900 | C: 0.45125 ** [JOINT LOSS] ** : 0.911164 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004729 | Grad Max: 0.121480 -> Layer: shared_layers.0.bias | Grad Mean: 0.207725 | Grad Max: 1.284658 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001244 | Grad Max: 0.005050 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017454 | Grad Max: 0.017454 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001833 | Grad Max: 0.082115 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034886 | Grad Max: 0.429885 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.010176 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016052 | Grad Max: 0.052456 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001332 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004153 | Grad Max: 0.009862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000063 | Grad Max: 0.000842 -> 
Layer: exit2_layers.9.bias | Grad Mean: 0.001780 | Grad Max: 0.004797 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005054 | Grad Max: 0.011616 -> Layer: exit2_layers.12.bias | Grad Mean: 0.067754 | Grad Max: 0.067754 [GRADIENT NORM TOTAL] 4.2740 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.115 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.511518 0.48848203] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.006 [MASKS] A(Pass/Fail): 0/1616 | B: 123/1925 | C: 221/1827 [LOSS Ex1] A: 0.00000 | B: 0.68477 | C: 0.68193 [LOGITS Ex2 A] Mean Abs: 0.966 | Max: 3.913 [LOSS Ex2] A: 0.42710 | B: 0.47495 | C: 0.45052 ** [JOINT LOSS] ** : 0.906420 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006219 | Grad Max: 0.179009 -> Layer: shared_layers.0.bias | Grad Mean: 0.269701 | Grad Max: 1.647094 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001184 | Grad Max: 0.005321 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013013 | Grad Max: 0.013013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002389 | Grad Max: 0.104021 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045675 | Grad Max: 0.546358 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000629 | Grad Max: 0.014190 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020855 | Grad Max: 0.069593 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000134 | Grad Max: 0.001714 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005400 | Grad Max: 0.012614 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000081 | Grad Max: 0.001057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002304 | Grad Max: 0.006476 -> Layer: exit2_layers.12.weight | Grad Mean: 0.006516 | Grad Max: 0.014732 -> Layer: exit2_layers.12.bias | Grad Mean: 0.086683 | Grad Max: 0.086683 [GRADIENT NORM TOTAL] 5.5142 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.028 | Max: 0.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108996 0.4891004] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 130/1726 | C: 238/1810 [LOSS Ex1] A: 0.68424 | B: 0.68643 | C: 0.68162 [LOGITS Ex2 A] Mean Abs: 0.914 | Max: 3.843 [LOSS Ex2] A: 0.43530 | B: 0.45927 | C: 0.43118 ** [JOINT LOSS] ** : 1.126011 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005624 | Grad Max: 0.149093 -> Layer: shared_layers.0.bias | Grad Mean: 0.208167 | Grad Max: 1.265968 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.010410 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018221 | Grad Max: 0.018221 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001909 | Grad Max: 0.078900 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036033 | Grad Max: 0.435985 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.011118 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016207 | Grad Max: 0.054914 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001260 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004201 | Grad Max: 0.009635 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000064 | Grad Max: 0.000880 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001794 | Grad Max: 0.005035 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005151 | Grad Max: 0.011869 -> Layer: exit2_layers.12.bias | Grad Mean: 0.068487 | Grad Max: 0.068487 [GRADIENT NORM TOTAL] 4.2644 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.220 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046276 0.49537238] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 128/1920 | C: 209/1839 [LOSS Ex1] A: 0.68416 | B: 0.68612 | C: 0.68342 [LOGITS Ex2 A] Mean Abs: 0.856 | Max: 3.720 [LOSS Ex2] A: 0.40550 | B: 0.46512 | C: 0.43975 ** [JOINT LOSS] ** : 1.121355 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001493 | Grad Max: 0.038116 -> Layer: shared_layers.0.bias | Grad Mean: 0.010417 | Grad Max: 0.076368 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.010289 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016749 | Grad Max: 0.016749 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000182 | Grad Max: 0.014433 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002498 | Grad Max: 0.080229 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000030 | Grad Max: 0.002178 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000614 | Grad Max: 0.004940 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000184 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000136 | Grad Max: 0.000994 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000059 | Grad Max: 0.000406 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000592 | Grad Max: 0.001986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000707 | Grad Max: 0.000707 [GRADIENT NORM TOTAL] 0.3535 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.224 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50958896 0.490411 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 4/2044 | B: 135/1913 | C: 133/1243 [LOSS Ex1] A: 0.68326 | B: 0.68636 | C: 0.68330 [LOGITS Ex2 A] Mean Abs: 0.831 | Max: 3.813 [LOSS Ex2] A: 0.41506 | B: 0.45669 | C: 0.47042 ** [JOINT LOSS] ** : 1.131695 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003122 | Grad Max: 0.094528 -> Layer: shared_layers.0.bias | Grad Mean: 0.160746 | Grad Max: 1.010098 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.010324 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015929 | Grad Max: 0.015929 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001390 | Grad Max: 
0.058448 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026419 | Grad Max: 0.319131 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000367 | Grad Max: 0.008674 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012255 | Grad Max: 0.044864 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000995 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003155 | Grad Max: 0.007567 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000629 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001354 | Grad Max: 0.003908 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003934 | Grad Max: 0.008758 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052981 | Grad Max: 0.052981 [GRADIENT NORM TOTAL] 3.2790 [EPOCH SUMMARY] Train Loss: 1.0489 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.1015 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 16/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51105624 0.48894376] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 124/1924 | C: 203/1845 [LOSS Ex1] A: 0.68455 | B: 0.68467 | C: 0.68411 [LOGITS Ex2 A] Mean Abs: 0.827 | Max: 3.921 [LOSS Ex2] A: 0.43074 | B: 0.44308 | C: 0.47362 ** [JOINT LOSS] ** : 1.133587 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004487 | Grad Max: 0.132604 -> Layer: shared_layers.0.bias | Grad Mean: 0.179252 | Grad Max: 1.065669 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001715 | Grad Max: 0.008490 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009162 | Grad Max: 0.009162 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001612 | Grad Max: 0.061179 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030387 | Grad Max: 0.339816 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000421 | Grad Max: 0.010141 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013990 | Grad Max: 0.052960 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001180 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003624 | Grad Max: 0.008381 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000736 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001549 | Grad Max: 0.004320 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004508 | Grad Max: 0.009440 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059915 | Grad Max: 0.059915 [GRADIENT NORM TOTAL] 3.6228 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.144 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034008 0.49659914] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 0/2048 | B: 132/1724 | C: 231/1817 [LOSS Ex1] A: 0.00000 | B: 0.68633 | C: 0.68203 [LOGITS Ex2 A] Mean Abs: 0.803 | Max: 3.944 [LOSS Ex2] A: 0.40524 | B: 0.44198 | C: 0.43917 ** [JOINT LOSS] ** : 0.884918 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.071360 -> Layer: shared_layers.0.bias | Grad Mean: 0.091014 | Grad Max: 0.539404 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.005291 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013430 | Grad Max: 0.013430 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000841 | Grad Max: 0.039313 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015776 | Grad Max: 0.227533 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.005422 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007428 | Grad Max: 0.025816 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000735 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001936 | Grad Max: 0.004825 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000388 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000822 | Grad 
Max: 0.002284 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002440 | Grad Max: 0.006008 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031566 | Grad Max: 0.031566 [GRADIENT NORM TOTAL] 1.9216 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5247694 0.47523054] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 128/1920 | C: 233/1815 [LOSS Ex1] A: 0.68419 | B: 0.68601 | C: 0.68034 [LOGITS Ex2 A] Mean Abs: 0.889 | Max: 4.247 [LOSS Ex2] A: 0.42121 | B: 0.48323 | C: 0.43635 ** [JOINT LOSS] ** : 1.130441 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004491 | Grad Max: 0.123723 -> Layer: shared_layers.0.bias | Grad Mean: 0.160572 | Grad Max: 0.970658 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.010220 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012871 | Grad Max: 0.012871 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001460 | Grad Max: 0.065423 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027273 | Grad Max: 0.364945 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.008141 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012190 | Grad Max: 0.042043 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001042 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003146 | Grad Max: 0.007951 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000689 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001335 | Grad Max: 0.003885 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003765 | Grad Max: 0.007837 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049801 | Grad Max: 0.049801 [GRADIENT NORM TOTAL] 3.2879 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.227 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.507427 0.492573] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 136/1912 | C: 239/1809 [LOSS Ex1] A: 0.68521 | B: 0.68626 | C: 0.68083 [LOGITS Ex2 A] Mean Abs: 0.932 | Max: 3.791 [LOSS Ex2] A: 0.43016 | B: 0.48678 | C: 0.45217 ** [JOINT LOSS] ** : 1.140468 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005027 | Grad Max: 0.147015 -> Layer: shared_layers.0.bias | Grad Mean: 0.254324 | Grad Max: 1.569784 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001789 | Grad Max: 0.009115 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010002 | Grad Max: 0.010002 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.095881 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042344 | Grad Max: 0.535378 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000583 | Grad Max: 0.012295 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019491 | Grad Max: 0.065240 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000123 | Grad Max: 0.001608 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005006 | Grad Max: 0.012105 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000074 | Grad Max: 0.000885 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002116 | Grad Max: 0.005856 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005940 | Grad Max: 0.013931 -> Layer: exit2_layers.12.bias | Grad Mean: 0.079019 | Grad Max: 0.079019 [GRADIENT NORM TOTAL] 5.2178 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5257787 0.47422132] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 0/2048 | B: 127/1921 | C: 216/1832 [LOSS Ex1] A: 0.00000 | B: 0.68456 | C: 0.68386 [LOGITS Ex2 A] Mean Abs: 0.906 | Max: 3.868 [LOSS Ex2] A: 0.41577 | B: 0.46173 | C: 0.43326 ** [JOINT LOSS] ** : 0.893063 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad 
Mean: 0.004417 | Grad Max: 0.120156 -> Layer: shared_layers.0.bias | Grad Mean: 0.207726 | Grad Max: 1.262976 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001332 | Grad Max: 0.005507 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020607 | Grad Max: 0.020607 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001801 | Grad Max: 0.075290 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034295 | Grad Max: 0.423118 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.010700 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015706 | Grad Max: 0.051935 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000099 | Grad Max: 0.001301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004034 | Grad Max: 0.009598 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.000722 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001711 | Grad Max: 0.004612 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004786 | Grad Max: 0.009870 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064589 | Grad Max: 0.064589 [GRADIENT NORM TOTAL] 4.2212 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.117 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51220924 0.4877908 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 0/1616 | B: 133/1723 | C: 203/1845 [LOSS Ex1] A: 0.00000 | B: 0.68624 | C: 0.68257 [LOGITS Ex2 A] Mean Abs: 0.894 | Max: 4.048 [LOSS Ex2] A: 0.38565 | B: 0.44125 | C: 0.47047 ** [JOINT LOSS] ** : 0.888723 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.045406 -> Layer: shared_layers.0.bias | Grad Mean: 0.019875 | Grad Max: 0.087856 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001040 | Grad Max: 0.004993 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006941 | Grad Max: 0.006941 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000272 | Grad Max: 0.025610 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004334 | 
Grad Max: 0.144034 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002505 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001587 | Grad Max: 0.009628 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000253 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000426 | Grad Max: 0.001868 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000176 | Grad Max: 0.000697 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000614 | Grad Max: 0.002544 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006703 | Grad Max: 0.006703 [GRADIENT NORM TOTAL] 0.5506 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.227 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5106218 0.4893782] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 128/1920 | C: 199/1849 [LOSS Ex1] A: 0.68406 | B: 0.68592 | C: 0.68269 [LOGITS Ex2 A] Mean Abs: 0.879 | Max: 4.162 [LOSS Ex2] A: 0.40121 | B: 0.46604 | C: 0.43272 ** [JOINT LOSS] ** : 1.117548 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002475 | Grad Max: 0.060079 -> Layer: shared_layers.0.bias | Grad Mean: 0.083778 | Grad Max: 0.479013 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.010175 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017274 | Grad Max: 0.017274 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000749 | Grad Max: 0.038670 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013972 | Grad Max: 0.213861 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.003960 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006263 | Grad Max: 0.020095 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000591 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001614 | Grad Max: 0.004303 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 
0.000323 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000682 | Grad Max: 0.001933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001985 | Grad Max: 0.004789 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026148 | Grad Max: 0.026148 [GRADIENT NORM TOTAL] 1.6908 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.222 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50428486 0.4957151 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 138/1910 | C: 213/1835 [LOSS Ex1] A: 0.68397 | B: 0.68616 | C: 0.68371 [LOGITS Ex2 A] Mean Abs: 0.859 | Max: 3.993 [LOSS Ex2] A: 0.39865 | B: 0.44419 | C: 0.43523 ** [JOINT LOSS] ** : 1.110633 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.042184 -> Layer: shared_layers.0.bias | Grad Mean: 0.073431 | Grad Max: 0.413499 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.010720 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022378 | Grad Max: 0.022378 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000622 | Grad Max: 0.027579 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011491 | Grad Max: 0.147508 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.004581 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005188 | Grad Max: 0.018807 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000500 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001344 | Grad Max: 0.003814 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000563 | Grad Max: 0.001822 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.004850 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020945 | Grad Max: 0.020945 [GRADIENT NORM TOTAL] 1.4449 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 
[LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.227 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50926006 0.4907399 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 4/2044 | B: 127/1921 | C: 221/1827 [LOSS Ex1] A: 0.68308 | B: 0.68446 | C: 0.68100 [LOGITS Ex2 A] Mean Abs: 0.892 | Max: 4.002 [LOSS Ex2] A: 0.40651 | B: 0.43981 | C: 0.44363 ** [JOINT LOSS] ** : 1.112829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003407 | Grad Max: 0.112847 -> Layer: shared_layers.0.bias | Grad Mean: 0.105245 | Grad Max: 0.611043 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.010381 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011301 | Grad Max: 0.011301 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001031 | Grad Max: 0.048134 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019112 | Grad Max: 0.256926 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000261 | Grad Max: 0.006098 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008536 | Grad Max: 0.030141 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000787 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002185 | Grad Max: 0.005304 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000454 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000922 | Grad Max: 0.002615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002532 | Grad Max: 0.005693 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034189 | Grad Max: 0.034189 [GRADIENT NORM TOTAL] 2.2326 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51074445 0.48925558] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 134/1722 | C: 244/1804 [LOSS Ex1] A: 0.68443 | B: 0.68613 | C: 0.67933 [LOGITS Ex2 A] Mean Abs: 0.888 | Max: 3.998 [LOSS Ex2] A: 0.40921 | B: 0.43983 | C: 0.42373 ** [JOINT 
LOSS] ** : 1.107554 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002722 | Grad Max: 0.074738 -> Layer: shared_layers.0.bias | Grad Mean: 0.096372 | Grad Max: 0.544514 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001867 | Grad Max: 0.009163 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007225 | Grad Max: 0.007225 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000872 | Grad Max: 0.040852 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016399 | Grad Max: 0.220382 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.005276 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007439 | Grad Max: 0.025759 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000644 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001899 | Grad Max: 0.004728 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000394 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000809 | Grad Max: 0.002396 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002212 | Grad Max: 0.005645 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030404 | Grad Max: 0.030404 [GRADIENT NORM TOTAL] 1.9474 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.028 | Max: 0.144 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038853 0.4961147] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 129/1919 | C: 217/1831 [LOSS Ex1] A: 0.68406 | B: 0.68581 | C: 0.68041 [LOGITS Ex2 A] Mean Abs: 0.828 | Max: 3.761 [LOSS Ex2] A: 0.40028 | B: 0.45402 | C: 0.44444 ** [JOINT LOSS] ** : 1.116334 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001494 | Grad Max: 0.043796 -> Layer: shared_layers.0.bias | Grad Mean: 0.062049 | Grad Max: 0.390537 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.008442 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002821 | Grad Max: 0.002821 -> Layer: exit2_layers.0.weight | Grad Mean: 
0.000585 | Grad Max: 0.041597 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010837 | Grad Max: 0.226150 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.003746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004877 | Grad Max: 0.017263 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000431 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001259 | Grad Max: 0.003135 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000247 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000534 | Grad Max: 0.001536 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001660 | Grad Max: 0.004499 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021715 | Grad Max: 0.021715 [GRADIENT NORM TOTAL] 1.3588 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.205 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52569985 0.47430015] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 141/1907 | C: 225/1823 [LOSS Ex1] A: 0.68400 | B: 0.68605 | C: 0.68160 [LOGITS Ex2 A] Mean Abs: 0.862 | Max: 3.802 [LOSS Ex2] A: 0.39726 | B: 0.44786 | C: 0.43976 ** [JOINT LOSS] ** : 1.112172 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001472 | Grad Max: 0.029595 -> Layer: shared_layers.0.bias | Grad Mean: 0.030065 | Grad Max: 0.172416 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.010226 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017991 | Grad Max: 0.017991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000357 | Grad Max: 0.023048 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006096 | Grad Max: 0.119529 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.002758 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002498 | Grad Max: 0.013571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000313 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000644 | Grad 
Max: 0.002138 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000273 | Grad Max: 0.001040 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000987 | Grad Max: 0.003157 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012046 | Grad Max: 0.012046 [GRADIENT NORM TOTAL] 0.7164 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.229 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071367 0.49286327] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 3/2045 | B: 127/1921 | C: 244/1804 [LOSS Ex1] A: 0.68506 | B: 0.68433 | C: 0.68056 [LOGITS Ex2 A] Mean Abs: 0.914 | Max: 3.916 [LOSS Ex2] A: 0.37902 | B: 0.44053 | C: 0.41107 ** [JOINT LOSS] ** : 1.093522 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001631 | Grad Max: 0.030929 -> Layer: shared_layers.0.bias | Grad Mean: 0.074335 | Grad Max: 0.445686 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.009660 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013553 | Grad Max: 0.013553 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000596 | Grad Max: 0.030112 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011218 | Grad Max: 0.166252 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.003910 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005129 | Grad Max: 0.020468 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000455 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001286 | Grad Max: 0.003408 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000279 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000545 | Grad Max: 0.001668 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001376 | Grad Max: 0.004385 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020409 | Grad Max: 0.020409 [GRADIENT NORM TOTAL] 1.4699 >>> [TRAIN] BATCH 13 START <<< [DATA A] 
Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5267029 0.4732971] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 0/2048 | B: 134/1722 | C: 160/1216 [LOSS Ex1] A: 0.00000 | B: 0.68601 | C: 0.68014 [LOGITS Ex2 A] Mean Abs: 0.886 | Max: 4.165 [LOSS Ex2] A: 0.40681 | B: 0.43759 | C: 0.41315 ** [JOINT LOSS] ** : 0.874566 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001323 | Grad Max: 0.017181 -> Layer: shared_layers.0.bias | Grad Mean: 0.020848 | Grad Max: 0.133769 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005266 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009209 | Grad Max: 0.009209 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000211 | Grad Max: 0.035727 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003582 | Grad Max: 0.196897 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002199 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001247 | Grad Max: 0.009574 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000300 | Grad Max: 0.001464 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000167 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000139 | Grad Max: 0.000923 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000816 | Grad Max: 0.002617 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005880 | Grad Max: 0.005880 [GRADIENT NORM TOTAL] 0.5293 [EPOCH SUMMARY] Train Loss: 1.0512 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0864 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0980 -> New: 1.0864) ############################## EPOCH 17/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.120 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51294786 0.48705214] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 0/1616 | B: 130/1918 | C: 225/1823 [LOSS Ex1] A: 0.00000 | B: 0.68568 | C: 0.68134 [LOGITS Ex2 A] Mean Abs: 0.913 | Max: 3.896 [LOSS Ex2] A: 0.39461 | B: 0.46118 | C: 0.43143 ** [JOINT LOSS] ** : 0.884746 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.055038 -> Layer: shared_layers.0.bias | Grad Mean: 0.067780 | Grad Max: 0.378242 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.005091 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012619 | Grad Max: 0.012619 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000641 | Grad Max: 0.035616 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011567 | Grad Max: 0.180236 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.004128 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005278 | Grad Max: 0.018436 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000564 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001363 | Grad Max: 0.003364 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000321 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000567 | Grad Max: 0.001848 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001724 | Grad Max: 0.004282 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021690 | Grad Max: 0.021690 [GRADIENT NORM TOTAL] 1.3920 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.231 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5103812 0.4896188] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 143/1905 | C: 236/1812 [LOSS Ex1] A: 0.68386 | B: 0.68592 | C: 0.68151 [LOGITS Ex2 A] Mean Abs: 0.925 | Max: 4.098 [LOSS Ex2] A: 0.40267 | B: 0.44647 | C: 0.43113 ** [JOINT LOSS] ** : 1.110520 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001332 | Grad Max: 0.036418 -> Layer: shared_layers.0.bias | Grad Mean: 0.035328 | Grad Max: 0.180566 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.009974 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014690 | Grad Max: 0.014690 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000332 | Grad Max: 0.021158 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006027 | Grad Max: 0.112314 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002432 | Grad Max: 0.014657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000611 | Grad Max: 0.001923 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000188 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000252 | Grad Max: 0.001035 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000599 | Grad Max: 0.002266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008144 | Grad Max: 0.008144 [GRADIENT NORM TOTAL] 0.7514 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50389457 0.49610546] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 127/1921 | C: 221/1827 [LOSS Ex1] A: 0.68372 | B: 0.68418 | C: 0.68166 [LOGITS Ex2 A] Mean Abs: 0.916 | Max: 4.210 [LOSS Ex2] A: 0.39187 | B: 0.42916 | C: 0.42168 ** [JOINT LOSS] ** : 1.097426 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001292 | Grad Max: 0.017251 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.010881 | Grad Max: 0.079772 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.010810 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018295 | Grad Max: 0.018295 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000160 | Grad Max: 0.019682 -> Layer: exit2_layers.0.bias | Grad Mean: 0.002270 | Grad Max: 0.111924 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000029 | Grad Max: 0.001832 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000635 | Grad Max: 0.006618 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000133 | Grad Max: 0.000889 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000055 | Grad Max: 0.000475 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000749 | Grad Max: 0.002079 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000029 | Grad Max: 0.000029 [GRADIENT NORM TOTAL] 0.3289 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.230 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50896627 0.49103376] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 4/2044 | B: 134/1722 | C: 197/1851 [LOSS Ex1] A: 0.68283 | B: 0.68586 | C: 0.68368 [LOGITS Ex2 A] Mean Abs: 0.928 | Max: 4.508 [LOSS Ex2] A: 0.39535 | B: 0.43602 | C: 0.44188 ** [JOINT LOSS] ** : 1.108540 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001449 | Grad Max: 0.031632 -> Layer: shared_layers.0.bias | Grad Mean: 0.041764 | Grad Max: 0.256673 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001949 | Grad Max: 0.010308 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015618 | Grad Max: 0.015618 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000399 | Grad Max: 0.019510 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006800 | Grad Max: 0.110206 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 
| Grad Max: 0.002685 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002626 | Grad Max: 0.012104 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000650 | Grad Max: 0.002221 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.001121 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000797 | Grad Max: 0.002680 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010639 | Grad Max: 0.010639 [GRADIENT NORM TOTAL] 0.8587 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.204 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104768 0.48952317] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 2/2046 | B: 130/1918 | C: 234/1814 [LOSS Ex1] A: 0.68423 | B: 0.68552 | C: 0.68138 [LOGITS Ex2 A] Mean Abs: 0.914 | Max: 4.258 [LOSS Ex2] A: 0.40070 | B: 0.45908 | C: 0.42844 ** [JOINT LOSS] ** : 1.113118 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001475 | Grad Max: 0.035359 -> Layer: shared_layers.0.bias | Grad Mean: 0.059925 | Grad Max: 0.343280 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001763 | Grad Max: 0.008671 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008451 | Grad Max: 0.008451 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000483 | Grad Max: 0.026277 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009010 | Grad Max: 0.143426 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.003344 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003835 | Grad Max: 0.013804 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000425 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000951 | Grad Max: 0.003013 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000382 | Grad Max: 
0.001253 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000883 | Grad Max: 0.002414 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012854 | Grad Max: 0.012854 [GRADIENT NORM TOTAL] 1.1869 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.029 | Max: 0.146 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044901 0.49550986] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.007 [MASKS] A(Pass/Fail): 1/2047 | B: 144/1904 | C: 247/1801 [LOSS Ex1] A: 0.68385 | B: 0.68576 | C: 0.67952 [LOGITS Ex2 A] Mean Abs: 0.895 | Max: 4.391 [LOSS Ex2] A: 0.39381 | B: 0.44478 | C: 0.42799 ** [JOINT LOSS] ** : 1.105236 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.047290 -> Layer: shared_layers.0.bias | Grad Mean: 0.020806 | Grad Max: 0.102459 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.008561 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001383 | Grad Max: 0.001383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000266 | Grad Max: 0.017881 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004454 | Grad Max: 0.097392 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.003075 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001875 | Grad Max: 0.010899 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000274 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000486 | Grad Max: 0.001711 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000143 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000198 | Grad Max: 0.000736 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000821 | Grad Max: 0.002719 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008510 | Grad Max: 0.008510 [GRADIENT NORM TOTAL] 0.5355 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.209 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5270523 0.47294778] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 1/2047 | B: 128/1920 | C: 208/1840 [LOSS Ex1] A: 0.68369 | B: 0.68401 | C: 0.68308 [LOGITS Ex2 A] Mean Abs: 0.943 | Max: 4.507 [LOSS Ex2] A: 0.38491 | B: 0.44137 | C: 0.44218 ** [JOINT LOSS] ** : 1.106415 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001765 | Grad Max: 0.039053 -> Layer: shared_layers.0.bias | Grad Mean: 0.052567 | Grad Max: 0.330271 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.010672 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021163 | Grad Max: 0.021163 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.023262 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008671 | Grad Max: 0.128519 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.003510 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003989 | Grad Max: 0.016820 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000979 | Grad Max: 0.002836 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000231 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000397 | Grad Max: 0.001301 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000931 | Grad Max: 0.002892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013813 | Grad Max: 0.013813 [GRADIENT NORM TOTAL] 1.0945 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.234 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068086 0.49319148] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 3/2045 | B: 135/1721 | C: 231/1817 [LOSS Ex1] A: 0.68481 | B: 0.68570 | C: 0.68096 [LOGITS Ex2 A] Mean Abs: 0.933 | Max: 4.288 [LOSS Ex2] A: 0.39978 | B: 0.42745 | C: 0.43220 ** [JOINT LOSS] ** : 1.103632 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad 
Mean: 0.003737 | Grad Max: 0.101406 -> Layer: shared_layers.0.bias | Grad Mean: 0.080079 | Grad Max: 0.427770 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001788 | Grad Max: 0.008766 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012580 | Grad Max: 0.012580 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000813 | Grad Max: 0.061041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014659 | Grad Max: 0.307049 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000193 | Grad Max: 0.004699 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006338 | Grad Max: 0.022008 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000584 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001607 | Grad Max: 0.003855 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000333 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000655 | Grad Max: 0.001964 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002066 | Grad Max: 0.004657 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025284 | Grad Max: 0.025284 [GRADIENT NORM TOTAL] 1.7521 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52806 0.47193998] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 0/2048 | B: 132/1916 | C: 228/1820 [LOSS Ex1] A: 0.00000 | B: 0.68535 | C: 0.68067 [LOGITS Ex2 A] Mean Abs: 0.928 | Max: 4.399 [LOSS Ex2] A: 0.39462 | B: 0.46194 | C: 0.43539 ** [JOINT LOSS] ** : 0.885988 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.043215 -> Layer: shared_layers.0.bias | Grad Mean: 0.064381 | Grad Max: 0.397835 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.005337 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011015 | Grad Max: 0.011015 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000606 | Grad Max: 0.058803 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011054 | 
Grad Max: 0.334285 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.004520 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004814 | Grad Max: 0.020439 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000466 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001199 | Grad Max: 0.003454 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000483 | Grad Max: 0.001457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001504 | Grad Max: 0.003486 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018269 | Grad Max: 0.018269 [GRADIENT NORM TOTAL] 1.4570 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.124 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5140021 0.48599786] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 0/1616 | B: 144/1904 | C: 238/1810 [LOSS Ex1] A: 0.00000 | B: 0.68560 | C: 0.68105 [LOGITS Ex2 A] Mean Abs: 1.003 | Max: 4.162 [LOSS Ex2] A: 0.38482 | B: 0.44620 | C: 0.43731 ** [JOINT LOSS] ** : 0.878329 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003411 | Grad Max: 0.104121 -> Layer: shared_layers.0.bias | Grad Mean: 0.100101 | Grad Max: 0.545095 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001108 | Grad Max: 0.005435 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007526 | Grad Max: 0.007526 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000966 | Grad Max: 0.044281 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017970 | Grad Max: 0.239407 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.005817 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007926 | Grad Max: 0.029930 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000642 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001957 | Grad Max: 0.005000 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 
0.000392 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000785 | Grad Max: 0.002267 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002222 | Grad Max: 0.004948 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028397 | Grad Max: 0.028397 [GRADIENT NORM TOTAL] 2.0778 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.236 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100983 0.4899017] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 2/2046 | B: 128/1920 | C: 229/1819 [LOSS Ex1] A: 0.68354 | B: 0.68383 | C: 0.67981 [LOGITS Ex2 A] Mean Abs: 0.988 | Max: 5.048 [LOSS Ex2] A: 0.40170 | B: 0.44208 | C: 0.42305 ** [JOINT LOSS] ** : 1.104673 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004612 | Grad Max: 0.148670 -> Layer: shared_layers.0.bias | Grad Mean: 0.140364 | Grad Max: 0.782256 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.009659 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008097 | Grad Max: 0.008097 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001320 | Grad Max: 0.063392 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024487 | Grad Max: 0.335590 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.008016 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010873 | Grad Max: 0.036998 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000848 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002698 | Grad Max: 0.006339 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000531 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001076 | Grad Max: 0.002879 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003085 | Grad Max: 0.005850 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038776 | Grad Max: 0.038776 [GRADIENT NORM TOTAL] 2.8737 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 
[LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.229 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503425 0.496575] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 1/2047 | B: 135/1721 | C: 230/1818 [LOSS Ex1] A: 0.68337 | B: 0.68553 | C: 0.68031 [LOGITS Ex2 A] Mean Abs: 0.957 | Max: 4.203 [LOSS Ex2] A: 0.38450 | B: 0.42459 | C: 0.45147 ** [JOINT LOSS] ** : 1.103257 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001434 | Grad Max: 0.043666 -> Layer: shared_layers.0.bias | Grad Mean: 0.055729 | Grad Max: 0.348707 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.010282 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014600 | Grad Max: 0.014600 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000505 | Grad Max: 0.031430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008811 | Grad Max: 0.179186 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.003186 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003606 | Grad Max: 0.015426 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000407 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000900 | Grad Max: 0.002729 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000227 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000362 | Grad Max: 0.001220 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001040 | Grad Max: 0.002555 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014188 | Grad Max: 0.014188 [GRADIENT NORM TOTAL] 1.1900 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.235 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50861263 0.4913874 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 4/2044 | B: 132/1916 | C: 214/1834 [LOSS Ex1] A: 0.68248 | B: 0.68519 | C: 0.68154 [LOGITS Ex2 A] Mean Abs: 0.954 | Max: 4.339 [LOSS Ex2] A: 0.38966 | B: 0.45685 | C: 0.42298 ** [JOINT LOSS] 
** : 1.106233 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.057758 -> Layer: shared_layers.0.bias | Grad Mean: 0.081006 | Grad Max: 0.501004 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.010534 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014468 | Grad Max: 0.014468 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000743 | Grad Max: 0.046758 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012969 | Grad Max: 0.265081 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.004678 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005408 | Grad Max: 0.022936 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000503 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001323 | Grad Max: 0.003733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000529 | Grad Max: 0.001688 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001476 | Grad Max: 0.003920 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019959 | Grad Max: 0.019959 [GRADIENT NORM TOTAL] 1.6954 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101837 0.48981625] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 2/2046 | B: 145/1903 | C: 140/1236 [LOSS Ex1] A: 0.68397 | B: 0.68544 | C: 0.68114 [LOGITS Ex2 A] Mean Abs: 0.966 | Max: 4.321 [LOSS Ex2] A: 0.37702 | B: 0.44031 | C: 0.43515 ** [JOINT LOSS] ** : 1.101010 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001506 | Grad Max: 0.023875 -> Layer: shared_layers.0.bias | Grad Mean: 0.054680 | Grad Max: 0.330586 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001781 | Grad Max: 0.008914 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009138 | Grad Max: 0.009138 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000464 | 
Grad Max: 0.029429 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008470 | Grad Max: 0.167503 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004165 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003587 | Grad Max: 0.018441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000339 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000860 | Grad Max: 0.002670 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000281 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000341 | Grad Max: 0.001238 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000735 | Grad Max: 0.002763 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011444 | Grad Max: 0.011444 [GRADIENT NORM TOTAL] 1.1379 [EPOCH SUMMARY] Train Loss: 1.0578 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0761 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0864 -> New: 1.0761) ############################## EPOCH 18/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.030 | Max: 0.148 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50518167 0.49481836] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 2/2046 | B: 130/1918 | C: 221/1827 [LOSS Ex1] A: 0.68358 | B: 0.68366 | C: 0.68179 [LOGITS Ex2 A] Mean Abs: 0.929 | Max: 4.297 [LOSS Ex2] A: 0.38223 | B: 0.44288 | C: 0.43252 ** [JOINT LOSS] ** : 1.102214 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.027965 -> Layer: shared_layers.0.bias | Grad Mean: 0.030940 | Grad Max: 0.231439 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.009095 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007431 | Grad Max: 0.007431 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000311 | Grad Max: 0.032581 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005317 | Grad Max: 0.182568 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.002370 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001681 | Grad Max: 0.010042 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000306 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000379 | Grad Max: 0.001505 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000145 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000692 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001811 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005432 | Grad Max: 0.005432 [GRADIENT NORM TOTAL] 0.7187 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.213 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52862984 0.4713702 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.008 [MASKS] A(Pass/Fail): 1/2047 | B: 136/1720 | C: 213/1835 [LOSS Ex1] A: 0.68334 | B: 0.68537 | C: 0.68066 [LOGITS Ex2 A] Mean Abs: 0.982 | Max: 5.019 [LOSS Ex2] A: 0.37694 | B: 0.42463 | C: 0.42356 ** [JOINT LOSS] ** : 1.091500 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002703 | Grad Max: 0.075449 -> Layer: shared_layers.0.bias | Grad Mean: 0.078013 | Grad Max: 0.424874 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001993 | Grad Max: 0.010461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018169 | Grad Max: 0.018169 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000739 | Grad Max: 0.038404 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013496 | Grad Max: 0.213731 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.005135 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005871 | Grad Max: 0.023869 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000513 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001452 | Grad Max: 0.003651 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000292 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000574 | Grad Max: 0.001704 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001818 | Grad Max: 0.004966 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021654 | Grad Max: 0.021654 [GRADIENT NORM TOTAL] 1.5997 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.239 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50648326 0.49351668] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 3/2045 | B: 132/1916 | C: 246/1802 [LOSS Ex1] A: 0.68452 | B: 0.68502 | C: 0.68001 [LOGITS Ex2 A] Mean Abs: 0.976 | Max: 4.307 [LOSS Ex2] A: 0.39080 | B: 0.45223 | C: 0.41307 ** [JOINT LOSS] ** : 1.101884 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.055476 -> Layer: shared_layers.0.bias | Grad Mean: 0.022125 | Grad Max: 0.114261 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.009029 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009375 | Grad Max: 0.009375 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000299 | Grad Max: 0.018715 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004436 | Grad Max: 0.069392 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.002020 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001327 | Grad Max: 0.008990 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000212 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000345 | Grad Max: 0.001486 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000684 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000629 | Grad Max: 0.002054 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006199 | Grad Max: 0.006199 [GRADIENT NORM TOTAL] 0.5858 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.031 | Max: 0.087 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5294981 0.47050187] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 0/2048 | B: 146/1902 | C: 226/1822 [LOSS Ex1] A: 0.00000 | B: 0.68528 | C: 0.68107 [LOGITS Ex2 A] Mean Abs: 0.995 | Max: 4.402 [LOSS Ex2] A: 0.38579 | B: 0.45116 | C: 0.42474 ** [JOINT LOSS] ** : 0.876009 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002336 | Grad Max: 0.055560 -> Layer: shared_layers.0.bias | Grad Mean: 0.100298 | Grad Max: 0.592342 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.005203 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012743 | Grad Max: 0.012743 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000857 | Grad Max: 0.040604 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016164 | Grad Max: 0.233396 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.006302 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007441 | Grad Max: 0.034691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000540 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001790 | Grad Max: 0.004390 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000347 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000704 | Grad Max: 0.001957 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001852 | Grad Max: 0.004123 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024880 | Grad Max: 0.024880 [GRADIENT NORM TOTAL] 2.0622 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.128 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5150813 0.4849187] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.008 [MASKS] A(Pass/Fail): 0/1616 | B: 130/1918 | C: 228/1820 [LOSS Ex1] A: 0.00000 | B: 0.68348 | C: 0.68159 [LOGITS Ex2 A] Mean Abs: 1.030 | Max: 4.586 [LOSS Ex2] A: 0.37365 | B: 0.43036 | C: 0.42989 ** [JOINT LOSS] ** : 0.866324 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.050695 -> Layer: shared_layers.0.bias | Grad Mean: 0.084209 | Grad Max: 0.490123 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001204 | Grad Max: 0.005184 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013601 | Grad Max: 0.013601 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000725 | Grad Max: 0.066515 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013542 | Grad Max: 0.376407 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.005188 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006085 | Grad Max: 0.023624 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000520 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001472 | Grad Max: 0.003624 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000348 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001799 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001553 | Grad Max: 0.003574 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020352 | Grad Max: 0.020352 [GRADIENT NORM TOTAL] 1.7422 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.241 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5098773 0.4901227] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 2/2046 | B: 136/1720 | C: 243/1805 [LOSS Ex1] A: 0.68321 | B: 0.68521 | C: 0.67947 [LOGITS Ex2 A] Mean Abs: 0.989 | Max: 5.067 [LOSS Ex2] A: 0.37965 | B: 0.42728 | C: 0.44940 ** [JOINT LOSS] ** : 1.101404 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.072137 -> Layer: shared_layers.0.bias | Grad Mean: 0.122643 | Grad Max: 0.775028 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001909 | Grad Max: 0.009879 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010264 | Grad Max: 0.010264 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001062 | Grad Max: 
0.053038 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020077 | Grad Max: 0.305860 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.006432 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009534 | Grad Max: 0.030597 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000744 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002326 | Grad Max: 0.006118 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000409 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000908 | Grad Max: 0.002493 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002538 | Grad Max: 0.005158 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033667 | Grad Max: 0.033667 [GRADIENT NORM TOTAL] 2.5723 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.234 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030059 0.49699402] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.009 [MASKS] A(Pass/Fail): 1/2047 | B: 134/1914 | C: 235/1813 [LOSS Ex1] A: 0.68301 | B: 0.68486 | C: 0.68070 [LOGITS Ex2 A] Mean Abs: 0.978 | Max: 4.597 [LOSS Ex2] A: 0.37037 | B: 0.46054 | C: 0.44665 ** [JOINT LOSS] ** : 1.108714 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002463 | Grad Max: 0.087341 -> Layer: shared_layers.0.bias | Grad Mean: 0.139236 | Grad Max: 0.880457 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.010799 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021632 | Grad Max: 0.021632 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001209 | Grad Max: 0.063758 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022721 | Grad Max: 0.343404 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000304 | Grad Max: 0.007752 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010300 | Grad Max: 0.037712 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000828 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002496 | Grad Max: 0.006091 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000507 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000988 | Grad Max: 0.003056 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002878 | Grad Max: 0.006105 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037683 | Grad Max: 0.037683 [GRADIENT NORM TOTAL] 2.9228 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.239 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50830007 0.4916999 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 5/2043 | B: 147/1901 | C: 206/1842 [LOSS Ex1] A: 0.68212 | B: 0.68512 | C: 0.68151 [LOGITS Ex2 A] Mean Abs: 0.991 | Max: 4.866 [LOSS Ex2] A: 0.39510 | B: 0.44359 | C: 0.42008 ** [JOINT LOSS] ** : 1.102508 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003406 | Grad Max: 0.116625 -> Layer: shared_layers.0.bias | Grad Mean: 0.039946 | Grad Max: 0.183652 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.009907 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009543 | Grad Max: 0.009543 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000544 | Grad Max: 0.025601 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007185 | Grad Max: 0.139218 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.002582 -> Layer: exit2_layers.3.bias | Grad Mean: 0.000997 | Grad Max: 0.012667 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000150 | Grad Max: 0.000948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000059 | Grad Max: 0.000375 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001342 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001862 | Grad Max: 0.001862 [GRADIENT NORM TOTAL] 1.0289 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.211 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50992167 0.4900783 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 2/2046 | B: 130/1918 | C: 223/1825 [LOSS Ex1] A: 0.68370 | B: 0.68331 | C: 0.68129 [LOGITS Ex2 A] Mean Abs: 1.018 | Max: 4.685 [LOSS Ex2] A: 0.39000 | B: 0.42877 | C: 0.42418 ** [JOINT LOSS] ** : 1.097086 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003957 | Grad Max: 0.095260 -> Layer: shared_layers.0.bias | Grad Mean: 0.166597 | Grad Max: 1.015105 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001823 | Grad Max: 0.008802 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007839 | Grad Max: 0.007839 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.080671 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027312 | Grad Max: 0.458161 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.008843 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012182 | Grad Max: 0.043272 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000923 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002961 | Grad Max: 0.006588 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000516 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001147 | Grad Max: 0.003237 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002994 | Grad Max: 0.006482 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040468 | Grad Max: 0.040468 [GRADIENT NORM TOTAL] 3.3973 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.151 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50582075 0.49417922] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.008 [MASKS] A(Pass/Fail): 2/2046 | B: 137/1719 | C: 208/1840 [LOSS Ex1] A: 0.68331 | B: 0.68506 | C: 0.68142 [LOGITS Ex2 A] Mean Abs: 0.993 | Max: 4.457 [LOSS 
Ex2] A: 0.37589 | B: 0.42330 | C: 0.41235 ** [JOINT LOSS] ** : 1.087109 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002613 | Grad Max: 0.050132 -> Layer: shared_layers.0.bias | Grad Mean: 0.081399 | Grad Max: 0.483575 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.008501 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001682 | Grad Max: 0.001682 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000761 | Grad Max: 0.038403 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013864 | Grad Max: 0.216687 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.004824 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006145 | Grad Max: 0.024005 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000539 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001475 | Grad Max: 0.003790 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000577 | Grad Max: 0.001775 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001568 | Grad Max: 0.004366 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020926 | Grad Max: 0.020926 [GRADIENT NORM TOTAL] 1.6861 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53015053 0.4698495 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 1/2047 | B: 137/1911 | C: 222/1826 [LOSS Ex1] A: 0.68299 | B: 0.68471 | C: 0.68203 [LOGITS Ex2 A] Mean Abs: 1.004 | Max: 4.778 [LOSS Ex2] A: 0.38918 | B: 0.46156 | C: 0.44097 ** [JOINT LOSS] ** : 1.113813 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003982 | Grad Max: 0.119144 -> Layer: shared_layers.0.bias | Grad Mean: 0.092328 | Grad Max: 0.508912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.010560 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021617 | Grad Max: 
0.021617 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000905 | Grad Max: 0.055473 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016639 | Grad Max: 0.282534 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.006116 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007255 | Grad Max: 0.028071 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000608 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001780 | Grad Max: 0.004266 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000353 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000701 | Grad Max: 0.002124 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002072 | Grad Max: 0.004487 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026428 | Grad Max: 0.026428 [GRADIENT NORM TOTAL] 1.9907 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.244 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062049 0.49379513] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 3/2045 | B: 148/1900 | C: 255/1793 [LOSS Ex1] A: 0.68425 | B: 0.68498 | C: 0.67830 [LOGITS Ex2 A] Mean Abs: 1.014 | Max: 4.611 [LOSS Ex2] A: 0.37593 | B: 0.43827 | C: 0.40569 ** [JOINT LOSS] ** : 1.089138 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005411 | Grad Max: 0.161034 -> Layer: shared_layers.0.bias | Grad Mean: 0.111527 | Grad Max: 0.611851 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.008470 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005857 | Grad Max: 0.005857 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001143 | Grad Max: 0.052000 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020472 | Grad Max: 0.244815 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000273 | Grad Max: 0.006894 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008929 | Grad Max: 0.034655 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000730 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.002187 | Grad Max: 0.005439 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000386 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000850 | Grad Max: 0.002385 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002679 | Grad Max: 0.005682 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032236 | Grad Max: 0.032236 [GRADIENT NORM TOTAL] 2.4034 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.090 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5308137 0.46918628] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 0/2048 | B: 133/1915 | C: 242/1806 [LOSS Ex1] A: 0.00000 | B: 0.68316 | C: 0.68013 [LOGITS Ex2 A] Mean Abs: 1.027 | Max: 4.585 [LOSS Ex2] A: 0.37872 | B: 0.43486 | C: 0.40746 ** [JOINT LOSS] ** : 0.861442 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.064303 -> Layer: shared_layers.0.bias | Grad Mean: 0.086425 | Grad Max: 0.496068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001219 | Grad Max: 0.005246 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011295 | Grad Max: 0.011295 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000768 | Grad Max: 0.052432 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013424 | Grad Max: 0.298382 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.004095 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004664 | Grad Max: 0.019703 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000445 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001082 | Grad Max: 0.002878 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000419 | Grad Max: 0.001319 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000898 | Grad Max: 0.002653 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014665 | Grad Max: 0.014665 [GRADIENT NORM 
TOTAL] 1.8033 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.131 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5160247 0.4839753] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 1/1615 | B: 137/1719 | C: 155/1221 [LOSS Ex1] A: 0.68253 | B: 0.68492 | C: 0.68039 [LOGITS Ex2 A] Mean Abs: 1.085 | Max: 4.679 [LOSS Ex2] A: 0.36566 | B: 0.42323 | C: 0.44127 ** [JOINT LOSS] ** : 1.092666 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002660 | Grad Max: 0.052425 -> Layer: shared_layers.0.bias | Grad Mean: 0.062812 | Grad Max: 0.343283 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001889 | Grad Max: 0.009257 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002320 | Grad Max: 0.002320 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000572 | Grad Max: 0.032207 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009404 | Grad Max: 0.180940 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.003854 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003421 | Grad Max: 0.019334 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000375 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000801 | Grad Max: 0.002500 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000308 | Grad Max: 0.001305 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.002680 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010857 | Grad Max: 0.010857 [GRADIENT NORM TOTAL] 1.2379 [EPOCH SUMMARY] Train Loss: 1.0494 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0679 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0761 -> New: 1.0679) ############################## EPOCH 19/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.246 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096955 0.49030453] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 2/2046 | B: 138/1910 | C: 219/1829 [LOSS Ex1] A: 0.68292 | B: 0.68457 | C: 0.68114 [LOGITS Ex2 A] Mean Abs: 1.040 | Max: 5.381 [LOSS Ex2] A: 0.35619 | B: 0.45310 | C: 0.42314 ** [JOINT LOSS] ** : 1.093686 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001375 | Grad Max: 0.034492 -> Layer: shared_layers.0.bias | Grad Mean: 0.018601 | Grad Max: 0.109097 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.010106 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017811 | Grad Max: 0.017811 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000230 | Grad Max: 0.017639 -> Layer: exit2_layers.0.bias | Grad Mean: 0.003598 | Grad Max: 0.097229 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002156 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001219 | Grad Max: 0.007355 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000231 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000280 | Grad Max: 0.001556 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000143 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000105 | Grad Max: 0.000498 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000447 | Grad Max: 0.001705 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003808 | Grad Max: 0.003808 [GRADIENT NORM TOTAL] 0.4705 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.238 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50266856 0.4973315 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 1/2047 | B: 149/1899 | C: 236/1812 [LOSS Ex1] A: 0.68270 | B: 0.68484 | C: 0.67977 [LOGITS Ex2 A] Mean Abs: 1.005 | Max: 4.552 [LOSS Ex2] A: 0.37148 | B: 0.44475 | C: 0.42521 ** [JOINT LOSS] ** : 1.096254 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001493 | Grad Max: 0.055633 -> Layer: shared_layers.0.bias | Grad Mean: 0.068249 | Grad Max: 0.437536 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.009827 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009703 | Grad Max: 0.009703 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000619 | Grad Max: 0.031182 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011314 | Grad Max: 0.167805 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.005036 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005231 | Grad Max: 0.023848 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000487 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001243 | Grad Max: 0.003389 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000478 | Grad Max: 0.001457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001321 | Grad Max: 0.003488 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017882 | Grad Max: 0.017882 [GRADIENT NORM TOTAL] 1.4827 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.243 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080542 0.4919458] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 5/2043 | B: 134/1914 | C: 218/1830 [LOSS Ex1] A: 0.68181 | B: 0.68300 | C: 0.68030 [LOGITS Ex2 A] Mean Abs: 1.032 | Max: 4.886 [LOSS Ex2] A: 0.39412 | B: 0.42534 | C: 0.42032 ** [JOINT LOSS] ** : 1.094965 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004972 | Grad Max: 0.171887 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.083107 | Grad Max: 0.388099 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.010329 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009804 | Grad Max: 0.009804 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000898 | Grad Max: 0.048274 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015639 | Grad Max: 0.227361 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.005055 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006279 | Grad Max: 0.023733 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000592 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001536 | Grad Max: 0.004375 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000333 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000586 | Grad Max: 0.001970 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.003499 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020425 | Grad Max: 0.020425 [GRADIENT NORM TOTAL] 1.8932 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50973535 0.49026462] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 2/2046 | B: 137/1719 | C: 226/1822 [LOSS Ex1] A: 0.68347 | B: 0.68477 | C: 0.68155 [LOGITS Ex2 A] Mean Abs: 1.005 | Max: 4.636 [LOSS Ex2] A: 0.37940 | B: 0.42830 | C: 0.42335 ** [JOINT LOSS] ** : 1.093611 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002408 | Grad Max: 0.065587 -> Layer: shared_layers.0.bias | Grad Mean: 0.026533 | Grad Max: 0.098073 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001709 | Grad Max: 0.008469 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006808 | Grad Max: 0.006808 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000340 | Grad Max: 0.020708 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005396 | Grad Max: 0.106076 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 
| Grad Max: 0.002377 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001741 | Grad Max: 0.010403 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000233 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000427 | Grad Max: 0.001779 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000162 | Grad Max: 0.000880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000430 | Grad Max: 0.001805 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005079 | Grad Max: 0.005079 [GRADIENT NORM TOTAL] 0.6762 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.031 | Max: 0.153 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063159 0.49368408] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.511 | Std: 0.009 [MASKS] A(Pass/Fail): 2/2046 | B: 139/1909 | C: 247/1801 [LOSS Ex1] A: 0.68307 | B: 0.68442 | C: 0.67885 [LOGITS Ex2 A] Mean Abs: 0.978 | Max: 4.585 [LOSS Ex2] A: 0.37126 | B: 0.45644 | C: 0.42247 ** [JOINT LOSS] ** : 1.098837 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.063346 -> Layer: shared_layers.0.bias | Grad Mean: 0.116037 | Grad Max: 0.719553 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001857 | Grad Max: 0.008649 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000334 | Grad Max: 0.000334 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001013 | Grad Max: 0.079399 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018775 | Grad Max: 0.448394 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.006095 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008430 | Grad Max: 0.030593 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000583 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002017 | Grad Max: 0.004802 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000337 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000773 | Grad Max: 
0.002224 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002067 | Grad Max: 0.004463 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027726 | Grad Max: 0.027726 [GRADIENT NORM TOTAL] 2.4608 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.221 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5314298 0.46857014] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 1/2047 | B: 149/1899 | C: 219/1829 [LOSS Ex1] A: 0.68269 | B: 0.68469 | C: 0.68192 [LOGITS Ex2 A] Mean Abs: 1.012 | Max: 4.589 [LOSS Ex2] A: 0.35782 | B: 0.43318 | C: 0.43366 ** [JOINT LOSS] ** : 1.091318 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.046363 -> Layer: shared_layers.0.bias | Grad Mean: 0.051421 | Grad Max: 0.287754 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001952 | Grad Max: 0.009855 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016739 | Grad Max: 0.016739 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000498 | Grad Max: 0.030112 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008935 | Grad Max: 0.170696 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.003423 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003963 | Grad Max: 0.017181 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000465 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000953 | Grad Max: 0.002696 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000366 | Grad Max: 0.001190 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001162 | Grad Max: 0.003173 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014220 | Grad Max: 0.014220 [GRADIENT NORM TOTAL] 1.0728 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.249 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.50600255 0.49399745] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 3/2045 | B: 135/1913 | C: 252/1796 [LOSS Ex1] A: 0.68400 | B: 0.68285 | C: 0.67995 [LOGITS Ex2 A] Mean Abs: 1.053 | Max: 4.235 [LOSS Ex2] A: 0.37184 | B: 0.44861 | C: 0.42255 ** [JOINT LOSS] ** : 1.096598 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003248 | Grad Max: 0.113707 -> Layer: shared_layers.0.bias | Grad Mean: 0.227232 | Grad Max: 1.428709 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001898 | Grad Max: 0.009293 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013367 | Grad Max: 0.013367 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001798 | Grad Max: 0.095065 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034770 | Grad Max: 0.539935 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.011843 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016553 | Grad Max: 0.062107 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001237 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003913 | Grad Max: 0.009651 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000716 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001487 | Grad Max: 0.004203 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003793 | Grad Max: 0.007653 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053146 | Grad Max: 0.053146 [GRADIENT NORM TOTAL] 4.6962 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.093 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5320062 0.46799383] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 0/2048 | B: 138/1718 | C: 248/1800 [LOSS Ex1] A: 0.00000 | B: 0.68463 | C: 0.67896 [LOGITS Ex2 A] Mean Abs: 1.079 | Max: 4.524 [LOSS Ex2] A: 0.39363 | B: 0.44457 | C: 0.41547 ** [JOINT LOSS] ** : 0.872424 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.004290 | Grad Max: 0.123652 -> Layer: shared_layers.0.bias | Grad Mean: 0.236934 | Grad Max: 1.456883 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001161 | Grad Max: 0.005305 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009442 | Grad Max: 0.009442 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.108264 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037340 | Grad Max: 0.601556 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000510 | Grad Max: 0.012307 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017491 | Grad Max: 0.064642 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.001212 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004144 | Grad Max: 0.009639 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000638 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001579 | Grad Max: 0.004233 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004242 | Grad Max: 0.008531 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057470 | Grad Max: 0.057470 [GRADIENT NORM TOTAL] 4.8734 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.134 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51689625 0.48310378] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 1/1615 | B: 140/1908 | C: 244/1804 [LOSS Ex1] A: 0.68224 | B: 0.68428 | C: 0.68044 [LOGITS Ex2 A] Mean Abs: 1.082 | Max: 4.836 [LOSS Ex2] A: 0.34554 | B: 0.45815 | C: 0.40775 ** [JOINT LOSS] ** : 1.086133 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.044557 -> Layer: shared_layers.0.bias | Grad Mean: 0.089982 | Grad Max: 0.498503 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.009850 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010445 | Grad Max: 0.010445 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000743 | Grad Max: 0.044633 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.013359 | Grad Max: 0.246807 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.005117 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005693 | Grad Max: 0.024139 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001311 | Grad Max: 0.003579 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000223 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000500 | Grad Max: 0.001523 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001077 | Grad Max: 0.003088 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017377 | Grad Max: 0.017377 [GRADIENT NORM TOTAL] 1.7899 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.251 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50957453 0.49042544] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 215/1833 [LOSS Ex1] A: 0.68262 | B: 0.68456 | C: 0.68005 [LOGITS Ex2 A] Mean Abs: 1.054 | Max: 4.551 [LOSS Ex2] A: 0.37367 | B: 0.44152 | C: 0.44570 ** [JOINT LOSS] ** : 1.102707 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006315 | Grad Max: 0.154545 -> Layer: shared_layers.0.bias | Grad Mean: 0.197956 | Grad Max: 1.145364 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.009291 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010106 | Grad Max: 0.010106 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001823 | Grad Max: 0.083490 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033943 | Grad Max: 0.433136 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.010533 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015491 | Grad Max: 0.054168 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001044 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003705 | Grad Max: 0.008505 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad 
Max: 0.000591 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001415 | Grad Max: 0.003758 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004003 | Grad Max: 0.007356 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052040 | Grad Max: 0.052040 [GRADIENT NORM TOTAL] 4.0813 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.242 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023866 0.4976134] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 1/2047 | B: 136/1912 | C: 231/1817 [LOSS Ex1] A: 0.68238 | B: 0.68271 | C: 0.68122 [LOGITS Ex2 A] Mean Abs: 1.040 | Max: 5.081 [LOSS Ex2] A: 0.37993 | B: 0.43511 | C: 0.43448 ** [JOINT LOSS] ** : 1.098608 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006336 | Grad Max: 0.162087 -> Layer: shared_layers.0.bias | Grad Mean: 0.243172 | Grad Max: 1.404818 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.010726 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019589 | Grad Max: 0.019589 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.106992 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040077 | Grad Max: 0.586746 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000542 | Grad Max: 0.012120 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018336 | Grad Max: 0.062320 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004385 | Grad Max: 0.011009 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000689 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001666 | Grad Max: 0.004495 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004516 | Grad Max: 0.008945 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059878 | Grad Max: 0.059878 [GRADIENT NORM TOTAL] 4.9687 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 
1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.247 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50785494 0.49214506] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 5/2043 | B: 140/1716 | C: 244/1804 [LOSS Ex1] A: 0.68149 | B: 0.68451 | C: 0.67924 [LOGITS Ex2 A] Mean Abs: 1.044 | Max: 4.997 [LOSS Ex2] A: 0.37454 | B: 0.42681 | C: 0.42622 ** [JOINT LOSS] ** : 1.090936 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002957 | Grad Max: 0.072199 -> Layer: shared_layers.0.bias | Grad Mean: 0.145065 | Grad Max: 0.835343 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001973 | Grad Max: 0.009899 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009021 | Grad Max: 0.009021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001214 | Grad Max: 0.060798 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022776 | Grad Max: 0.346510 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000318 | Grad Max: 0.006956 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010921 | Grad Max: 0.037715 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000842 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002608 | Grad Max: 0.006154 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000412 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002690 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002773 | Grad Max: 0.005749 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037081 | Grad Max: 0.037081 [GRADIENT NORM TOTAL] 2.9533 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50958365 0.49041632] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 2/2046 | B: 142/1906 | C: 221/1827 [LOSS Ex1] A: 0.68323 | B: 0.68416 | C: 0.68065 [LOGITS Ex2 A] Mean Abs: 1.061 | Max: 4.774 [LOSS Ex2] A: 0.37540 | B: 0.46054 | C: 0.42869 ** 
[JOINT LOSS] ** : 1.104222 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003385 | Grad Max: 0.092664 -> Layer: shared_layers.0.bias | Grad Mean: 0.176653 | Grad Max: 1.052156 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.008624 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008311 | Grad Max: 0.008311 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.069732 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027795 | Grad Max: 0.393444 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000382 | Grad Max: 0.008707 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013104 | Grad Max: 0.045341 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000914 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003055 | Grad Max: 0.007395 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000477 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001150 | Grad Max: 0.003183 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002813 | Grad Max: 0.006623 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039974 | Grad Max: 0.039974 [GRADIENT NORM TOTAL] 3.5599 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.154 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506802 0.493198] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.009 [MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 155/1221 [LOSS Ex1] A: 0.68284 | B: 0.68445 | C: 0.67971 [LOGITS Ex2 A] Mean Abs: 1.056 | Max: 4.788 [LOSS Ex2] A: 0.38368 | B: 0.46178 | C: 0.42245 ** [JOINT LOSS] ** : 1.104968 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006473 | Grad Max: 0.171747 -> Layer: shared_layers.0.bias | Grad Mean: 0.269928 | Grad Max: 1.597622 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.008786 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003781 | Grad Max: 0.003781 -> Layer: exit2_layers.0.weight | Grad Mean: 
0.002316 | Grad Max: 0.108202 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043983 | Grad Max: 0.611451 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000591 | Grad Max: 0.015925 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020170 | Grad Max: 0.077199 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001517 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004769 | Grad Max: 0.011849 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000791 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001802 | Grad Max: 0.005029 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004748 | Grad Max: 0.009739 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063938 | Grad Max: 0.063938 [GRADIENT NORM TOTAL] 5.4609 [EPOCH SUMMARY] Train Loss: 1.0804 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0679 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0679 -> New: 1.0679) ############################## EPOCH 20/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5326474 0.4673526] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 1/2047 | B: 136/1912 | C: 251/1797 [LOSS Ex1] A: 0.68241 | B: 0.68259 | C: 0.67897 [LOGITS Ex2 A] Mean Abs: 1.070 | Max: 4.965 [LOSS Ex2] A: 0.36606 | B: 0.42882 | C: 0.39508 ** [JOINT LOSS] ** : 1.077975 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003845 | Grad Max: 0.108094 -> Layer: shared_layers.0.bias | Grad Mean: 0.152221 | Grad Max: 0.912392 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.010569 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018771 | Grad Max: 0.018771 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001294 | Grad Max: 0.065392 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024828 | Grad Max: 
0.370909 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.008556 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011393 | Grad Max: 0.041198 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000917 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002706 | Grad Max: 0.006724 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000413 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001030 | Grad Max: 0.002928 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002630 | Grad Max: 0.006100 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036922 | Grad Max: 0.036922 [GRADIENT NORM TOTAL] 3.1025 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.253 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5058149 0.4941851] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 141/1715 | C: 235/1813 [LOSS Ex1] A: 0.68378 | B: 0.68441 | C: 0.67853 [LOGITS Ex2 A] Mean Abs: 1.033 | Max: 4.304 [LOSS Ex2] A: 0.34848 | B: 0.42150 | C: 0.44147 ** [JOINT LOSS] ** : 1.086058 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003666 | Grad Max: 0.100122 -> Layer: shared_layers.0.bias | Grad Mean: 0.156550 | Grad Max: 0.931207 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001781 | Grad Max: 0.008550 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005516 | Grad Max: 0.005516 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001348 | Grad Max: 0.067403 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024855 | Grad Max: 0.374202 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.007774 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011647 | Grad Max: 0.039715 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000850 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002777 | Grad Max: 0.006745 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000497 -> 
Layer: exit2_layers.9.bias | Grad Mean: 0.001052 | Grad Max: 0.003049 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002859 | Grad Max: 0.005787 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038174 | Grad Max: 0.038174 [GRADIENT NORM TOTAL] 3.2255 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53302664 0.4669734 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 0/2048 | B: 143/1905 | C: 212/1836 [LOSS Ex1] A: 0.00000 | B: 0.68406 | C: 0.68176 [LOGITS Ex2 A] Mean Abs: 1.007 | Max: 4.437 [LOSS Ex2] A: 0.36836 | B: 0.46774 | C: 0.45093 ** [JOINT LOSS] ** : 0.884282 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003553 | Grad Max: 0.103743 -> Layer: shared_layers.0.bias | Grad Mean: 0.210486 | Grad Max: 1.246884 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001247 | Grad Max: 0.004795 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017403 | Grad Max: 0.017403 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.093486 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032724 | Grad Max: 0.519659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000451 | Grad Max: 0.010704 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015640 | Grad Max: 0.053806 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001113 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003733 | Grad Max: 0.008774 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000614 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001427 | Grad Max: 0.004055 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003756 | Grad Max: 0.007736 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052019 | Grad Max: 0.052019 [GRADIENT NORM TOTAL] 4.3326 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.033 | Max: 0.137 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51760423 0.48239574] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 1/1615 | B: 149/1899 | C: 246/1802 [LOSS Ex1] A: 0.68202 | B: 0.68435 | C: 0.67861 [LOGITS Ex2 A] Mean Abs: 1.049 | Max: 4.916 [LOSS Ex2] A: 0.35876 | B: 0.43080 | C: 0.40205 ** [JOINT LOSS] ** : 1.078864 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001900 | Grad Max: 0.055545 -> Layer: shared_layers.0.bias | Grad Mean: 0.100777 | Grad Max: 0.596611 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.009138 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002988 | Grad Max: 0.002988 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000885 | Grad Max: 0.060917 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016317 | Grad Max: 0.340202 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.006679 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007756 | Grad Max: 0.035238 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000672 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001847 | Grad Max: 0.004711 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000322 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000695 | Grad Max: 0.001937 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001900 | Grad Max: 0.004735 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025217 | Grad Max: 0.025217 [GRADIENT NORM TOTAL] 2.1545 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.254 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5094613 0.49053872] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 136/1912 | C: 263/1785 [LOSS Ex1] A: 0.68240 | B: 0.68249 | C: 0.67841 [LOGITS Ex2 A] Mean Abs: 1.096 | Max: 5.266 [LOSS Ex2] A: 0.38301 | B: 0.42916 | C: 0.42568 ** [JOINT LOSS] ** : 
1.093714 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005415 | Grad Max: 0.151652 -> Layer: shared_layers.0.bias | Grad Mean: 0.198683 | Grad Max: 1.154045 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.009614 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009021 | Grad Max: 0.009021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001795 | Grad Max: 0.082295 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033927 | Grad Max: 0.460100 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000450 | Grad Max: 0.011645 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015330 | Grad Max: 0.057780 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001056 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003629 | Grad Max: 0.008948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000570 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001374 | Grad Max: 0.003612 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003636 | Grad Max: 0.007991 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048906 | Grad Max: 0.048906 [GRADIENT NORM TOTAL] 4.1166 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.245 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50215775 0.49784225] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 1/2047 | B: 142/1714 | C: 231/1817 [LOSS Ex1] A: 0.68215 | B: 0.68432 | C: 0.67997 [LOGITS Ex2 A] Mean Abs: 1.094 | Max: 4.248 [LOSS Ex2] A: 0.38263 | B: 0.44352 | C: 0.41697 ** [JOINT LOSS] ** : 1.096519 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006393 | Grad Max: 0.170892 -> Layer: shared_layers.0.bias | Grad Mean: 0.257339 | Grad Max: 1.557462 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.010697 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017794 | Grad Max: 0.017794 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002266 | Grad 
Max: 0.105972 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043141 | Grad Max: 0.592697 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000585 | Grad Max: 0.013228 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020073 | Grad Max: 0.068517 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004748 | Grad Max: 0.010821 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.000722 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001791 | Grad Max: 0.004967 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004660 | Grad Max: 0.010094 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063788 | Grad Max: 0.063788 [GRADIENT NORM TOTAL] 5.2771 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.250 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077141 0.49228588] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 5/2043 | B: 143/1905 | C: 243/1805 [LOSS Ex1] A: 0.68126 | B: 0.68397 | C: 0.68203 [LOGITS Ex2 A] Mean Abs: 1.063 | Max: 4.721 [LOSS Ex2] A: 0.36639 | B: 0.45138 | C: 0.42704 ** [JOINT LOSS] ** : 1.097353 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004829 | Grad Max: 0.122097 -> Layer: shared_layers.0.bias | Grad Mean: 0.141521 | Grad Max: 0.802264 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.010896 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021480 | Grad Max: 0.021480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001312 | Grad Max: 0.057534 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024331 | Grad Max: 0.320149 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000322 | Grad Max: 0.009461 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010850 | Grad Max: 0.044149 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000792 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002572 | Grad Max: 0.006560 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000438 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000972 | Grad Max: 0.002688 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002492 | Grad Max: 0.005100 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035039 | Grad Max: 0.035039 [GRADIENT NORM TOTAL] 2.8969 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.220 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50947654 0.4905234 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 228/1820 [LOSS Ex1] A: 0.68305 | B: 0.68427 | C: 0.67983 [LOGITS Ex2 A] Mean Abs: 1.001 | Max: 4.605 [LOSS Ex2] A: 0.37208 | B: 0.44451 | C: 0.42279 ** [JOINT LOSS] ** : 1.095510 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004545 | Grad Max: 0.131208 -> Layer: shared_layers.0.bias | Grad Mean: 0.186119 | Grad Max: 1.093284 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001756 | Grad Max: 0.008378 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005130 | Grad Max: 0.005130 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001622 | Grad Max: 0.099009 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030497 | Grad Max: 0.538911 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000409 | Grad Max: 0.009521 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013975 | Grad Max: 0.054979 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001053 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003332 | Grad Max: 0.008580 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001255 | Grad Max: 0.003258 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003378 | Grad Max: 0.006791 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045023 | Grad Max: 0.045023 [GRADIENT NORM TOTAL] 3.8135 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.032 | Max: 0.156 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071312 0.49286878] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 136/1912 | C: 228/1820 [LOSS Ex1] A: 0.68265 | B: 0.68240 | C: 0.68059 [LOGITS Ex2 A] Mean Abs: 0.980 | Max: 4.393 [LOSS Ex2] A: 0.36636 | B: 0.43730 | C: 0.43643 ** [JOINT LOSS] ** : 1.095241 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006261 | Grad Max: 0.172233 -> Layer: shared_layers.0.bias | Grad Mean: 0.274738 | Grad Max: 1.666761 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001831 | Grad Max: 0.008735 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004383 | Grad Max: 0.004383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.127585 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045346 | Grad Max: 0.692875 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.017136 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021005 | Grad Max: 0.085723 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001389 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004967 | Grad Max: 0.011233 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000730 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001880 | Grad Max: 0.005033 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004960 | Grad Max: 0.009934 -> Layer: exit2_layers.12.bias | Grad Mean: 0.067569 | Grad Max: 0.067569 [GRADIENT NORM TOTAL] 5.6749 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.227 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5335702 0.46642986] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 1/2047 | B: 142/1714 | C: 216/1832 [LOSS Ex1] A: 0.68218 | B: 0.68424 | C: 0.68101 [LOGITS Ex2 A] Mean Abs: 1.035 
| Max: 4.798 [LOSS Ex2] A: 0.35380 | B: 0.43311 | C: 0.45605 ** [JOINT LOSS] ** : 1.096797 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004098 | Grad Max: 0.105308 -> Layer: shared_layers.0.bias | Grad Mean: 0.168035 | Grad Max: 0.981702 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.010670 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021004 | Grad Max: 0.021004 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001457 | Grad Max: 0.073461 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027022 | Grad Max: 0.404966 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000365 | Grad Max: 0.009372 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012445 | Grad Max: 0.044669 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000847 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002949 | Grad Max: 0.007131 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000504 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001119 | Grad Max: 0.003107 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002957 | Grad Max: 0.005999 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039673 | Grad Max: 0.039673 [GRADIENT NORM TOTAL] 3.4457 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.256 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50569296 0.494307 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 144/1904 | C: 235/1813 [LOSS Ex1] A: 0.68361 | B: 0.68389 | C: 0.67995 [LOGITS Ex2 A] Mean Abs: 1.058 | Max: 4.517 [LOSS Ex2] A: 0.35956 | B: 0.45905 | C: 0.39264 ** [JOINT LOSS] ** : 1.086233 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.048241 -> Layer: shared_layers.0.bias | Grad Mean: 0.111403 | Grad Max: 0.656656 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001755 | Grad Max: 0.008401 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007093 
| Grad Max: 0.007093 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000895 | Grad Max: 0.048810 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016652 | Grad Max: 0.272283 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000227 | Grad Max: 0.006306 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007822 | Grad Max: 0.029115 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000611 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001820 | Grad Max: 0.004955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000320 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000692 | Grad Max: 0.002000 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001604 | Grad Max: 0.003700 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024632 | Grad Max: 0.024632 [GRADIENT NORM TOTAL] 2.3045 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.097 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5338051 0.46619493] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 0/2048 | B: 150/1898 | C: 235/1813 [LOSS Ex1] A: 0.00000 | B: 0.68419 | C: 0.67933 [LOGITS Ex2 A] Mean Abs: 1.072 | Max: 4.435 [LOSS Ex2] A: 0.36685 | B: 0.44868 | C: 0.40327 ** [JOINT LOSS] ** : 0.860774 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003721 | Grad Max: 0.084158 -> Layer: shared_layers.0.bias | Grad Mean: 0.167985 | Grad Max: 1.020169 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001118 | Grad Max: 0.005378 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008691 | Grad Max: 0.008691 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001416 | Grad Max: 0.079536 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026776 | Grad Max: 0.448562 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000360 | Grad Max: 0.008011 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012440 | Grad Max: 0.040228 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 
0.000898 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002946 | Grad Max: 0.007362 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000477 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001117 | Grad Max: 0.003257 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002783 | Grad Max: 0.005783 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039245 | Grad Max: 0.039245 [GRADIENT NORM TOTAL] 3.5010 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.139 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51816875 0.48183125] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 1/1615 | B: 136/1912 | C: 220/1828 [LOSS Ex1] A: 0.68184 | B: 0.68232 | C: 0.67998 [LOGITS Ex2 A] Mean Abs: 1.084 | Max: 4.716 [LOSS Ex2] A: 0.35506 | B: 0.42589 | C: 0.42415 ** [JOINT LOSS] ** : 1.083076 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001684 | Grad Max: 0.025734 -> Layer: shared_layers.0.bias | Grad Mean: 0.046365 | Grad Max: 0.235782 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.008927 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000621 | Grad Max: 0.000621 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000408 | Grad Max: 0.026088 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007313 | Grad Max: 0.139165 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.003775 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003145 | Grad Max: 0.016575 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000311 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000739 | Grad Max: 0.002178 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000161 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000287 | Grad Max: 0.001017 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000728 | Grad Max: 0.002491 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010689 | Grad Max: 0.010689 
[GRADIENT NORM TOTAL] 0.9601 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.257 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093909 0.49060914] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 142/1714 | C: 154/1222 [LOSS Ex1] A: 0.68221 | B: 0.68416 | C: 0.68010 [LOGITS Ex2 A] Mean Abs: 1.045 | Max: 4.640 [LOSS Ex2] A: 0.37205 | B: 0.42208 | C: 0.43939 ** [JOINT LOSS] ** : 1.093327 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004130 | Grad Max: 0.104686 -> Layer: shared_layers.0.bias | Grad Mean: 0.188129 | Grad Max: 1.115011 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001857 | Grad Max: 0.009461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011913 | Grad Max: 0.011913 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.081064 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029379 | Grad Max: 0.454788 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.010230 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013833 | Grad Max: 0.048470 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000998 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003285 | Grad Max: 0.007861 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000510 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001239 | Grad Max: 0.003345 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003116 | Grad Max: 0.006121 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043952 | Grad Max: 0.043952 [GRADIENT NORM TOTAL] 3.7850 [EPOCH SUMMARY] Train Loss: 1.0590 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.0691 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 21/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.247 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.502029 0.49797106] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 1/2047 | B: 144/1904 | C: 243/1805 [LOSS Ex1] A: 0.68195 | B: 0.68380 | C: 0.67818 [LOGITS Ex2 A] Mean Abs: 1.038 | Max: 4.523 [LOSS Ex2] A: 0.36211 | B: 0.46973 | C: 0.42540 ** [JOINT LOSS] ** : 1.100387 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004909 | Grad Max: 0.122698 -> Layer: shared_layers.0.bias | Grad Mean: 0.250350 | Grad Max: 1.480775 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.009974 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009844 | Grad Max: 0.009844 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.114782 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039260 | Grad Max: 0.641793 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.012701 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018596 | Grad Max: 0.068132 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001106 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004396 | Grad Max: 0.009540 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000688 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001671 | Grad Max: 0.004395 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004330 | Grad Max: 0.008374 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060433 | Grad Max: 0.060433 [GRADIENT NORM TOTAL] 5.1026 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.253 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076179 0.49238214] | Indices: [1 
0] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 5/2043 | B: 151/1897 | C: 219/1829 [LOSS Ex1] A: 0.68106 | B: 0.68410 | C: 0.68166 [LOGITS Ex2 A] Mean Abs: 1.039 | Max: 4.463 [LOSS Ex2] A: 0.37256 | B: 0.44190 | C: 0.42357 ** [JOINT LOSS] ** : 1.094952 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.064510 -> Layer: shared_layers.0.bias | Grad Mean: 0.102820 | Grad Max: 0.599591 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.009989 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016494 | Grad Max: 0.016494 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.043043 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015204 | Grad Max: 0.238438 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.005564 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007035 | Grad Max: 0.028406 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000581 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001662 | Grad Max: 0.003944 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000272 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000626 | Grad Max: 0.001836 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001549 | Grad Max: 0.003420 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022593 | Grad Max: 0.022593 [GRADIENT NORM TOTAL] 2.0688 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.222 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5094007 0.49059927] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 136/1912 | C: 224/1824 [LOSS Ex1] A: 0.68290 | B: 0.68222 | C: 0.68114 [LOGITS Ex2 A] Mean Abs: 1.062 | Max: 4.690 [LOSS Ex2] A: 0.36419 | B: 0.43987 | C: 0.42208 ** [JOINT LOSS] ** : 1.090798 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004338 | Grad Max: 0.138156 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.226274 | Grad Max: 1.349641 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.008837 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008893 | Grad Max: 0.008893 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.101695 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035758 | Grad Max: 0.579624 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.010058 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016303 | Grad Max: 0.054939 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001145 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003846 | Grad Max: 0.009742 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000530 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001450 | Grad Max: 0.003749 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003482 | Grad Max: 0.006903 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050419 | Grad Max: 0.050419 [GRADIENT NORM TOTAL] 4.6036 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.157 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073779 0.49262208] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 143/1713 | C: 224/1824 [LOSS Ex1] A: 0.68251 | B: 0.68407 | C: 0.67907 [LOGITS Ex2 A] Mean Abs: 1.047 | Max: 4.474 [LOSS Ex2] A: 0.38247 | B: 0.43839 | C: 0.43340 ** [JOINT LOSS] ** : 1.099967 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006495 | Grad Max: 0.173904 -> Layer: shared_layers.0.bias | Grad Mean: 0.271802 | Grad Max: 1.589790 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001787 | Grad Max: 0.008284 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000067 | Grad Max: 0.000067 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.112749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043516 | Grad Max: 0.587782 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.014666 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019766 | Grad Max: 0.072264 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001354 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004679 | Grad Max: 0.011204 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000685 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001756 | Grad Max: 0.004572 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004400 | Grad Max: 0.008448 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061358 | Grad Max: 0.061358 [GRADIENT NORM TOTAL] 5.4867 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.230 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5343116 0.4656884] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 1/2047 | B: 144/1904 | C: 217/1831 [LOSS Ex1] A: 0.68200 | B: 0.68372 | C: 0.68031 [LOGITS Ex2 A] Mean Abs: 1.077 | Max: 4.966 [LOSS Ex2] A: 0.36482 | B: 0.45411 | C: 0.40406 ** [JOINT LOSS] ** : 1.089676 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004718 | Grad Max: 0.122908 -> Layer: shared_layers.0.bias | Grad Mean: 0.192182 | Grad Max: 1.103099 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.010403 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018416 | Grad Max: 0.018416 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001643 | Grad Max: 0.085848 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030778 | Grad Max: 0.487310 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.009196 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014014 | Grad Max: 0.047070 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.001011 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003335 | Grad Max: 0.008298 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000526 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001265 | Grad Max: 0.003365 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003200 | Grad Max: 0.006675 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045394 | Grad Max: 0.045394 [GRADIENT NORM TOTAL] 3.8870 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.259 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50561965 0.49438038] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 151/1897 | C: 236/1812 [LOSS Ex1] A: 0.68345 | B: 0.68402 | C: 0.67923 [LOGITS Ex2 A] Mean Abs: 1.045 | Max: 4.418 [LOSS Ex2] A: 0.36277 | B: 0.44386 | C: 0.41160 ** [JOINT LOSS] ** : 1.088309 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003603 | Grad Max: 0.088777 -> Layer: shared_layers.0.bias | Grad Mean: 0.087780 | Grad Max: 0.498775 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001807 | Grad Max: 0.008815 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010823 | Grad Max: 0.010823 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000875 | Grad Max: 0.044543 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015421 | Grad Max: 0.229121 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000197 | Grad Max: 0.006124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006706 | Grad Max: 0.028189 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000558 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001624 | Grad Max: 0.004386 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000293 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001927 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001721 | Grad Max: 0.004258 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022238 | Grad Max: 0.022238 [GRADIENT NORM TOTAL] 1.8698 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.034 | Max: 0.099 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53446394 0.4655361 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 0/2048 | B: 137/1911 | C: 240/1808 [LOSS Ex1] A: 0.00000 | B: 0.68213 | C: 0.67969 [LOGITS Ex2 A] Mean Abs: 1.031 | Max: 4.734 [LOSS Ex2] A: 0.35863 | B: 0.42954 | C: 0.41004 ** [JOINT LOSS] ** : 0.853341 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004314 | Grad Max: 0.116526 -> Layer: shared_layers.0.bias | Grad Mean: 0.155855 | Grad Max: 0.946404 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001186 | Grad Max: 0.005385 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009792 | Grad Max: 0.009792 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001453 | Grad Max: 0.104574 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026925 | Grad Max: 0.573081 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.008132 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012292 | Grad Max: 0.041375 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000794 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002936 | Grad Max: 0.006607 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000461 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001108 | Grad Max: 0.003041 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003005 | Grad Max: 0.006481 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039451 | Grad Max: 0.039451 [GRADIENT NORM TOTAL] 3.3038 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.141 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5186696 0.4813304] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 1/1615 | B: 143/1713 | C: 255/1793 [LOSS Ex1] A: 0.68166 | B: 0.68399 | C: 0.67785 [LOGITS Ex2 A] Mean Abs: 1.080 | Max: 4.509 [LOSS Ex2] A: 0.35450 | B: 0.41595 | C: 0.41556 ** [JOINT LOSS] ** : 1.076504 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.045507 -> Layer: shared_layers.0.bias | Grad Mean: 0.063289 | Grad Max: 0.338868 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.009282 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001986 | Grad Max: 0.001986 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000572 | Grad Max: 0.044190 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010367 | Grad Max: 0.256501 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.004031 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004505 | Grad Max: 0.017786 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000377 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001071 | Grad Max: 0.003251 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000214 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000393 | Grad Max: 0.001289 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001018 | Grad Max: 0.003266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013448 | Grad Max: 0.013448 [GRADIENT NORM TOTAL] 1.3402 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.260 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093554 0.49064457] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 145/1903 | C: 245/1803 [LOSS Ex1] A: 0.68204 | B: 0.68363 | C: 0.67949 [LOGITS Ex2 A] Mean Abs: 1.111 | Max: 4.837 [LOSS Ex2] A: 0.37808 | B: 0.46991 | C: 0.43509 ** [JOINT LOSS] ** : 1.109416 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005647 | Grad Max: 0.142185 -> Layer: shared_layers.0.bias | Grad Mean: 0.268410 | Grad Max: 1.553959 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001970 | Grad Max: 0.010031 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015841 | Grad Max: 0.015841 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 
0.112011 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042393 | Grad Max: 0.635080 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.012715 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019668 | Grad Max: 0.066740 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001290 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004657 | Grad Max: 0.011002 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000696 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001752 | Grad Max: 0.004711 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004450 | Grad Max: 0.008942 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061895 | Grad Max: 0.061895 [GRADIENT NORM TOTAL] 5.4119 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.250 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50189143 0.49810863] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 2/2046 | B: 151/1897 | C: 224/1824 [LOSS Ex1] A: 0.68176 | B: 0.68394 | C: 0.68111 [LOGITS Ex2 A] Mean Abs: 1.132 | Max: 4.481 [LOSS Ex2] A: 0.38579 | B: 0.46657 | C: 0.42391 ** [JOINT LOSS] ** : 1.107689 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007203 | Grad Max: 0.194034 -> Layer: shared_layers.0.bias | Grad Mean: 0.322761 | Grad Max: 1.913378 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.010180 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015147 | Grad Max: 0.015147 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002783 | Grad Max: 0.141313 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052615 | Grad Max: 0.774162 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000701 | Grad Max: 0.015651 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024247 | Grad Max: 0.084011 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000131 | Grad Max: 0.001685 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005733 | Grad Max: 0.013539 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000070 | Grad Max: 0.000845 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002160 | Grad Max: 0.005653 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005504 | Grad Max: 0.011092 -> Layer: exit2_layers.12.bias | Grad Mean: 0.075711 | Grad Max: 0.075711 [GRADIENT NORM TOTAL] 6.6261 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.256 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075382 0.4924618] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 5/2043 | B: 141/1907 | C: 235/1813 [LOSS Ex1] A: 0.68088 | B: 0.68204 | C: 0.67985 [LOGITS Ex2 A] Mean Abs: 1.098 | Max: 4.551 [LOSS Ex2] A: 0.38065 | B: 0.43560 | C: 0.41679 ** [JOINT LOSS] ** : 1.091937 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005576 | Grad Max: 0.156752 -> Layer: shared_layers.0.bias | Grad Mean: 0.198938 | Grad Max: 1.129021 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.010182 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011534 | Grad Max: 0.011534 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.084663 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033023 | Grad Max: 0.479800 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.011860 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014943 | Grad Max: 0.058129 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001090 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003547 | Grad Max: 0.008816 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000531 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001340 | Grad Max: 0.003504 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003464 | Grad Max: 0.006762 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047394 | Grad Max: 0.047394 [GRADIENT NORM TOTAL] 4.1132 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.224 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093507 0.4906493] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 144/1712 | C: 251/1797 [LOSS Ex1] A: 0.68275 | B: 0.68391 | C: 0.67960 [LOGITS Ex2 A] Mean Abs: 1.028 | Max: 4.572 [LOSS Ex2] A: 0.35330 | B: 0.41448 | C: 0.40399 ** [JOINT LOSS] ** : 1.072673 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002556 | Grad Max: 0.071805 -> Layer: shared_layers.0.bias | Grad Mean: 0.078239 | Grad Max: 0.384307 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001792 | Grad Max: 0.008620 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009703 | Grad Max: 0.009703 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.038589 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012772 | Grad Max: 0.215937 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000163 | Grad Max: 0.004428 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005513 | Grad Max: 0.020549 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000482 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001311 | Grad Max: 0.003756 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000262 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000486 | Grad Max: 0.001619 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001407 | Grad Max: 0.004608 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017262 | Grad Max: 0.017262 [GRADIENT NORM TOTAL] 1.5786 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.033 | Max: 0.159 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076469 0.49235314] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 148/1900 | C: 252/1796 [LOSS Ex1] A: 0.68235 | B: 0.68355 | C: 0.67796 [LOGITS Ex2 A] Mean Abs: 1.003 
| Max: 4.603 [LOSS Ex2] A: 0.35678 | B: 0.45420 | C: 0.41041 ** [JOINT LOSS] ** : 1.088418 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003448 | Grad Max: 0.095965 -> Layer: shared_layers.0.bias | Grad Mean: 0.187842 | Grad Max: 1.110170 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.008457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001537 | Grad Max: 0.106326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029292 | Grad Max: 0.608191 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.010031 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013488 | Grad Max: 0.055453 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000905 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003173 | Grad Max: 0.007687 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000466 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001195 | Grad Max: 0.003239 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003099 | Grad Max: 0.006405 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042433 | Grad Max: 0.042433 [GRADIENT NORM TOTAL] 3.8708 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.232 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53506166 0.46493837] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 156/1892 | C: 148/1228 [LOSS Ex1] A: 0.68182 | B: 0.68386 | C: 0.67994 [LOGITS Ex2 A] Mean Abs: 1.047 | Max: 4.590 [LOSS Ex2] A: 0.33481 | B: 0.43873 | C: 0.41616 ** [JOINT LOSS] ** : 1.078439 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001768 | Grad Max: 0.046445 -> Layer: shared_layers.0.bias | Grad Mean: 0.076378 | Grad Max: 0.416662 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.010232 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015725 
| Grad Max: 0.015725 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000625 | Grad Max: 0.043979 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011122 | Grad Max: 0.249136 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.004188 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005020 | Grad Max: 0.021502 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000452 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001179 | Grad Max: 0.003329 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000203 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000440 | Grad Max: 0.001370 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001187 | Grad Max: 0.003772 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015630 | Grad Max: 0.015630 [GRADIENT NORM TOTAL] 1.5029 [EPOCH SUMMARY] Train Loss: 1.0745 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0641 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0679 -> New: 1.0641) ############################## EPOCH 22/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.262 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055422 0.49445778] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 148/1900 | C: 231/1817 [LOSS Ex1] A: 0.68330 | B: 0.68195 | C: 0.67994 [LOGITS Ex2 A] Mean Abs: 1.088 | Max: 4.311 [LOSS Ex2] A: 0.34913 | B: 0.42867 | C: 0.42333 ** [JOINT LOSS] ** : 1.082106 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004530 | Grad Max: 0.120826 -> Layer: shared_layers.0.bias | Grad Mean: 0.223453 | Grad Max: 1.330951 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.009348 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014347 | Grad Max: 0.014347 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001856 | Grad Max: 0.112556 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.035517 | Grad Max: 0.635956 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.010714 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016419 | Grad Max: 0.058066 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001070 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003871 | Grad Max: 0.008693 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000615 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001460 | Grad Max: 0.003974 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003580 | Grad Max: 0.006972 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051331 | Grad Max: 0.051331 [GRADIENT NORM TOTAL] 4.5770 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53511965 0.46488038] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 0/2048 | B: 153/1703 | C: 248/1800 [LOSS Ex1] A: 0.00000 | B: 0.68382 | C: 0.68020 [LOGITS Ex2 A] Mean Abs: 1.100 | Max: 4.449 [LOSS Ex2] A: 0.37156 | B: 0.42652 | C: 0.41810 ** [JOINT LOSS] ** : 0.860067 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006180 | Grad Max: 0.150771 -> Layer: shared_layers.0.bias | Grad Mean: 0.266865 | Grad Max: 1.610612 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001275 | Grad Max: 0.005157 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018048 | Grad Max: 0.018048 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.115358 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043128 | Grad Max: 0.642819 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000569 | Grad Max: 0.013880 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019719 | Grad Max: 0.071334 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001283 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004653 | Grad Max: 0.010626 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000683 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001755 | Grad Max: 0.004712 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004533 | Grad Max: 0.008591 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062595 | Grad Max: 0.062595 [GRADIENT NORM TOTAL] 5.4504 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.143 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51918674 0.48081324] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.010 [MASKS] A(Pass/Fail): 2/1614 | B: 153/1895 | C: 244/1804 [LOSS Ex1] A: 0.68149 | B: 0.68347 | C: 0.67732 [LOGITS Ex2 A] Mean Abs: 1.115 | Max: 4.934 [LOSS Ex2] A: 0.32734 | B: 0.45755 | C: 0.41069 ** [JOINT LOSS] ** : 1.079283 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.055401 -> Layer: shared_layers.0.bias | Grad Mean: 0.092571 | Grad Max: 0.550082 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.009032 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001047 | Grad Max: 0.001047 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000782 | Grad Max: 0.076852 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014909 | Grad Max: 0.434616 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000190 | Grad Max: 0.005944 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006553 | Grad Max: 0.027953 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000500 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001528 | Grad Max: 0.003864 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000280 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000573 | Grad Max: 0.001928 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003557 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020145 | Grad Max: 0.020145 [GRADIENT NORM TOTAL] 1.9485 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.263 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093078 0.4906922] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 163/1885 | C: 244/1804 [LOSS Ex1] A: 0.68186 | B: 0.68377 | C: 0.67885 [LOGITS Ex2 A] Mean Abs: 1.066 | Max: 4.653 [LOSS Ex2] A: 0.35689 | B: 0.44502 | C: 0.40143 ** [JOINT LOSS] ** : 1.082612 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003889 | Grad Max: 0.097828 -> Layer: shared_layers.0.bias | Grad Mean: 0.183383 | Grad Max: 1.105089 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001966 | Grad Max: 0.010127 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014868 | Grad Max: 0.014868 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001527 | Grad Max: 0.083980 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028722 | Grad Max: 0.471678 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000382 | Grad Max: 0.007888 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013300 | Grad Max: 0.043242 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000894 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003152 | Grad Max: 0.007374 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000456 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001180 | Grad Max: 0.003214 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003013 | Grad Max: 0.005795 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041744 | Grad Max: 0.041744 [GRADIENT NORM TOTAL] 3.7291 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.253 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50175244 0.49824756] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 149/1899 | C: 227/1821 [LOSS Ex1] A: 0.68156 | B: 0.68186 | C: 0.68034 [LOGITS Ex2 A] Mean Abs: 1.056 | Max: 4.537 [LOSS 
Ex2] A: 0.35316 | B: 0.43409 | C: 0.45169 ** [JOINT LOSS] ** : 1.094234 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006137 | Grad Max: 0.147072 -> Layer: shared_layers.0.bias | Grad Mean: 0.263439 | Grad Max: 1.541981 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001996 | Grad Max: 0.009893 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012832 | Grad Max: 0.012832 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.112763 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042291 | Grad Max: 0.624063 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000565 | Grad Max: 0.013859 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019631 | Grad Max: 0.070626 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001298 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004653 | Grad Max: 0.010898 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000743 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001756 | Grad Max: 0.004849 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004597 | Grad Max: 0.008323 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062975 | Grad Max: 0.062975 [GRADIENT NORM TOTAL] 5.3535 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.259 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50742227 0.49257767] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 6/2042 | B: 154/1702 | C: 246/1802 [LOSS Ex1] A: 0.68068 | B: 0.68374 | C: 0.68020 [LOGITS Ex2 A] Mean Abs: 1.059 | Max: 4.535 [LOSS Ex2] A: 0.35725 | B: 0.41507 | C: 0.42078 ** [JOINT LOSS] ** : 1.079245 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002852 | Grad Max: 0.072068 -> Layer: shared_layers.0.bias | Grad Mean: 0.141598 | Grad Max: 0.831983 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001946 | Grad Max: 0.009864 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013707 | Grad Max: 
0.013707 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001149 | Grad Max: 0.060755 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021966 | Grad Max: 0.352317 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.007297 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010265 | Grad Max: 0.037308 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000703 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002423 | Grad Max: 0.005888 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000399 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000908 | Grad Max: 0.002638 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002284 | Grad Max: 0.004926 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031776 | Grad Max: 0.031776 [GRADIENT NORM TOTAL] 2.8667 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.226 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5092548 0.49074516] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 3/2045 | B: 153/1895 | C: 222/1826 [LOSS Ex1] A: 0.68260 | B: 0.68338 | C: 0.68025 [LOGITS Ex2 A] Mean Abs: 1.078 | Max: 4.656 [LOSS Ex2] A: 0.35663 | B: 0.46104 | C: 0.41937 ** [JOINT LOSS] ** : 1.094421 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.051743 -> Layer: shared_layers.0.bias | Grad Mean: 0.134230 | Grad Max: 0.753089 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.008105 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003588 | Grad Max: 0.003588 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001067 | Grad Max: 0.063749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020029 | Grad Max: 0.347898 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000267 | Grad Max: 0.006850 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009334 | Grad Max: 0.034395 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000718 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.002189 | Grad Max: 0.005985 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000331 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000816 | Grad Max: 0.002289 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001906 | Grad Max: 0.004360 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028088 | Grad Max: 0.028088 [GRADIENT NORM TOTAL] 2.6999 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.160 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50794715 0.49205288] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.512 | Std: 0.010 [MASKS] A(Pass/Fail): 4/2044 | B: 165/1883 | C: 233/1815 [LOSS Ex1] A: 0.68220 | B: 0.68369 | C: 0.67952 [LOGITS Ex2 A] Mean Abs: 1.071 | Max: 4.442 [LOSS Ex2] A: 0.36048 | B: 0.45762 | C: 0.41785 ** [JOINT LOSS] ** : 1.093787 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004323 | Grad Max: 0.108526 -> Layer: shared_layers.0.bias | Grad Mean: 0.216808 | Grad Max: 1.245048 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001855 | Grad Max: 0.009215 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010434 | Grad Max: 0.010434 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.103796 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033606 | Grad Max: 0.579509 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.010943 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015688 | Grad Max: 0.056864 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001128 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003710 | Grad Max: 0.008722 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000531 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001390 | Grad Max: 0.003663 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003401 | Grad Max: 0.006679 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048490 | Grad Max: 0.048490 [GRADIENT NORM 
TOTAL] 4.3841 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.235 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53586906 0.46413097] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 150/1898 | C: 235/1813 [LOSS Ex1] A: 0.68163 | B: 0.68178 | C: 0.67905 [LOGITS Ex2 A] Mean Abs: 1.090 | Max: 4.642 [LOSS Ex2] A: 0.35271 | B: 0.43174 | C: 0.40936 ** [JOINT LOSS] ** : 1.078758 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002477 | Grad Max: 0.066457 -> Layer: shared_layers.0.bias | Grad Mean: 0.096364 | Grad Max: 0.555884 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.009716 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010576 | Grad Max: 0.010576 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000797 | Grad Max: 0.047287 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014807 | Grad Max: 0.261720 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.004690 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006641 | Grad Max: 0.024433 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000478 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001560 | Grad Max: 0.004153 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000595 | Grad Max: 0.001831 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001558 | Grad Max: 0.003997 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021985 | Grad Max: 0.021985 [GRADIENT NORM TOTAL] 1.9382 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.265 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505439 0.49456105] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 156/1700 | C: 223/1825 [LOSS Ex1] A: 
0.68314 | B: 0.68366 | C: 0.67967 [LOGITS Ex2 A] Mean Abs: 1.058 | Max: 4.719 [LOSS Ex2] A: 0.35177 | B: 0.41659 | C: 0.41951 ** [JOINT LOSS] ** : 1.078117 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004553 | Grad Max: 0.128088 -> Layer: shared_layers.0.bias | Grad Mean: 0.158670 | Grad Max: 0.885726 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.009559 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016716 | Grad Max: 0.016716 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001392 | Grad Max: 0.071994 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025273 | Grad Max: 0.400589 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.006956 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011331 | Grad Max: 0.036252 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000842 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002713 | Grad Max: 0.006218 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000411 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002806 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002780 | Grad Max: 0.005550 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036515 | Grad Max: 0.036515 [GRADIENT NORM TOTAL] 3.2278 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.102 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5358309 0.46416909] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 0/2048 | B: 154/1894 | C: 255/1793 [LOSS Ex1] A: 0.00000 | B: 0.68330 | C: 0.67855 [LOGITS Ex2 A] Mean Abs: 1.055 | Max: 4.855 [LOSS Ex2] A: 0.35852 | B: 0.45845 | C: 0.41381 ** [JOINT LOSS] ** : 0.864212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005524 | Grad Max: 0.147295 -> Layer: shared_layers.0.bias | Grad Mean: 0.261749 | Grad Max: 1.524750 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001123 | Grad 
Max: 0.005215 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007476 | Grad Max: 0.007476 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.133882 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040615 | Grad Max: 0.730334 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.011662 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018673 | Grad Max: 0.065062 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000099 | Grad Max: 0.001192 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004418 | Grad Max: 0.010157 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000627 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001660 | Grad Max: 0.004375 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004291 | Grad Max: 0.008299 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058848 | Grad Max: 0.058848 [GRADIENT NORM TOTAL] 5.2698 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.144 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5196843 0.4803157] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/1614 | B: 168/1880 | C: 205/1843 [LOSS Ex1] A: 0.68132 | B: 0.68362 | C: 0.68106 [LOGITS Ex2 A] Mean Abs: 1.089 | Max: 4.801 [LOSS Ex2] A: 0.34803 | B: 0.44282 | C: 0.41445 ** [JOINT LOSS] ** : 1.083769 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003406 | Grad Max: 0.089123 -> Layer: shared_layers.0.bias | Grad Mean: 0.145226 | Grad Max: 0.828051 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001836 | Grad Max: 0.009263 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008031 | Grad Max: 0.008031 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001191 | Grad Max: 0.063661 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022414 | Grad Max: 0.370680 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000304 | Grad Max: 0.008073 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010478 | Grad Max: 0.038783 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000702 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002469 | Grad Max: 0.006054 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000377 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000919 | Grad Max: 0.002503 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002452 | Grad Max: 0.005520 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032830 | Grad Max: 0.032830 [GRADIENT NORM TOTAL] 2.8901 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509238 0.490762] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 151/1897 | C: 249/1799 [LOSS Ex1] A: 0.68170 | B: 0.68169 | C: 0.67778 [LOGITS Ex2 A] Mean Abs: 1.122 | Max: 6.306 [LOSS Ex2] A: 0.35723 | B: 0.42676 | C: 0.40834 ** [JOINT LOSS] ** : 1.077832 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004634 | Grad Max: 0.100497 -> Layer: shared_layers.0.bias | Grad Mean: 0.173893 | Grad Max: 0.989522 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.008983 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001827 | Grad Max: 0.001827 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001492 | Grad Max: 0.074146 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027966 | Grad Max: 0.404381 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000363 | Grad Max: 0.009133 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012626 | Grad Max: 0.046685 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000821 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002987 | Grad Max: 0.007175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000425 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001118 | Grad Max: 0.002898 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002849 | Grad Max: 0.006157 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.039589 | Grad Max: 0.039589 [GRADIENT NORM TOTAL] 3.5310 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.255 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015807 0.4984193] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 156/1700 | C: 171/1205 [LOSS Ex1] A: 0.68139 | B: 0.68358 | C: 0.67757 [LOGITS Ex2 A] Mean Abs: 1.135 | Max: 4.289 [LOSS Ex2] A: 0.36016 | B: 0.43713 | C: 0.38240 ** [JOINT LOSS] ** : 1.074075 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006461 | Grad Max: 0.159487 -> Layer: shared_layers.0.bias | Grad Mean: 0.270108 | Grad Max: 1.588480 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.010222 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015090 | Grad Max: 0.015090 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.109852 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042925 | Grad Max: 0.601425 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000564 | Grad Max: 0.013670 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019655 | Grad Max: 0.071287 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004634 | Grad Max: 0.011158 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000678 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001741 | Grad Max: 0.004725 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004467 | Grad Max: 0.008718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062003 | Grad Max: 0.062003 [GRADIENT NORM TOTAL] 5.4593 [EPOCH SUMMARY] Train Loss: 1.0516 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0572 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0641 -> New: 1.0572) ############################## EPOCH 23/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.261 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50730497 0.492695 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 6/2042 | B: 156/1892 | C: 267/1781 [LOSS Ex1] A: 0.68050 | B: 0.68322 | C: 0.67606 [LOGITS Ex2 A] Mean Abs: 1.109 | Max: 5.238 [LOSS Ex2] A: 0.36415 | B: 0.45150 | C: 0.43027 ** [JOINT LOSS] ** : 1.095234 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005123 | Grad Max: 0.134992 -> Layer: shared_layers.0.bias | Grad Mean: 0.161182 | Grad Max: 0.906454 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.009538 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004126 | Grad Max: 0.004126 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001396 | Grad Max: 0.067089 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026093 | Grad Max: 0.335241 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000343 | Grad Max: 0.009482 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011749 | Grad Max: 0.048084 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000801 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002778 | Grad Max: 0.006571 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000427 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001030 | Grad Max: 0.002796 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002592 | Grad Max: 0.005250 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035586 | Grad Max: 0.035586 [GRADIENT NORM TOTAL] 3.2385 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.228 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50918573 0.49081424] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 168/1880 | C: 223/1825 [LOSS Ex1] A: 0.68246 | B: 0.68354 | C: 0.68091 [LOGITS Ex2 A] Mean Abs: 1.040 | Max: 5.031 [LOSS Ex2] A: 0.35954 | B: 0.44352 | C: 0.42824 ** [JOINT LOSS] ** : 1.092738 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003122 | Grad Max: 0.075464 -> Layer: shared_layers.0.bias | Grad Mean: 0.142882 | Grad Max: 0.873323 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001773 | Grad Max: 0.008816 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009772 | Grad Max: 0.009772 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001217 | Grad Max: 0.092670 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022880 | Grad Max: 0.529027 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000299 | Grad Max: 0.008136 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010440 | Grad Max: 0.040381 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000713 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002451 | Grad Max: 0.006162 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000382 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000920 | Grad Max: 0.002681 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002457 | Grad Max: 0.005330 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032940 | Grad Max: 0.032940 [GRADIENT NORM TOTAL] 3.0035 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.161 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082023 0.49179772] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 5/2043 | B: 152/1896 | C: 244/1804 [LOSS Ex1] A: 0.68206 | B: 0.68161 | C: 0.67851 [LOGITS Ex2 A] Mean Abs: 1.018 | Max: 4.800 [LOSS Ex2] A: 0.35070 | B: 0.42319 | C: 0.39795 ** [JOINT LOSS] ** : 1.071342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004931 | Grad Max: 0.127516 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.234615 | Grad Max: 1.315274 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.008601 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002700 | Grad Max: 0.002700 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.104016 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036381 | Grad Max: 0.575461 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.011028 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016938 | Grad Max: 0.058081 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001076 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003981 | Grad Max: 0.009098 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000548 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001484 | Grad Max: 0.003854 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003793 | Grad Max: 0.007835 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051802 | Grad Max: 0.051802 [GRADIENT NORM TOTAL] 4.6856 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.237 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53661335 0.46338665] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 157/1699 | C: 261/1787 [LOSS Ex1] A: 0.68146 | B: 0.68351 | C: 0.67818 [LOGITS Ex2 A] Mean Abs: 1.073 | Max: 4.766 [LOSS Ex2] A: 0.32734 | B: 0.40819 | C: 0.40441 ** [JOINT LOSS] ** : 1.061030 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.060042 -> Layer: shared_layers.0.bias | Grad Mean: 0.127312 | Grad Max: 0.687334 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.009782 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011640 | Grad Max: 0.011640 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001028 | Grad Max: 0.054621 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019191 | Grad Max: 0.302755 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 
| Grad Max: 0.006130 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009034 | Grad Max: 0.032004 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000671 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002131 | Grad Max: 0.005466 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000320 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000787 | Grad Max: 0.002155 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001974 | Grad Max: 0.005511 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026874 | Grad Max: 0.026874 [GRADIENT NORM TOTAL] 2.5333 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.267 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50536186 0.49463814] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 157/1891 | C: 229/1819 [LOSS Ex1] A: 0.68300 | B: 0.68315 | C: 0.68010 [LOGITS Ex2 A] Mean Abs: 1.109 | Max: 4.545 [LOSS Ex2] A: 0.34968 | B: 0.45285 | C: 0.40700 ** [JOINT LOSS] ** : 1.085255 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003208 | Grad Max: 0.080885 -> Layer: shared_layers.0.bias | Grad Mean: 0.175309 | Grad Max: 1.052779 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001812 | Grad Max: 0.008680 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010630 | Grad Max: 0.010630 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001396 | Grad Max: 0.082477 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026940 | Grad Max: 0.450518 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.010607 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012598 | Grad Max: 0.048764 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000853 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002931 | Grad Max: 0.007241 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000430 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001093 | Grad Max: 
0.003025 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002648 | Grad Max: 0.005377 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037911 | Grad Max: 0.037911 [GRADIENT NORM TOTAL] 3.5679 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5364586 0.4635414] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 169/1879 | C: 214/1834 [LOSS Ex1] A: 0.68193 | B: 0.68347 | C: 0.68147 [LOGITS Ex2 A] Mean Abs: 1.129 | Max: 4.474 [LOSS Ex2] A: 0.36081 | B: 0.45058 | C: 0.40625 ** [JOINT LOSS] ** : 1.088163 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006110 | Grad Max: 0.144825 -> Layer: shared_layers.0.bias | Grad Mean: 0.276859 | Grad Max: 1.653459 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.009412 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012852 | Grad Max: 0.012852 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.118838 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043801 | Grad Max: 0.663423 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000566 | Grad Max: 0.013760 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019751 | Grad Max: 0.068547 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001432 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004644 | Grad Max: 0.011519 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000666 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001732 | Grad Max: 0.004435 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004330 | Grad Max: 0.008344 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060178 | Grad Max: 0.060178 [GRADIENT NORM TOTAL] 5.6519 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.146 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.52012134 0.47987872] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/1614 | B: 152/1896 | C: 263/1785 [LOSS Ex1] A: 0.68116 | B: 0.68153 | C: 0.67714 [LOGITS Ex2 A] Mean Abs: 1.146 | Max: 4.559 [LOSS Ex2] A: 0.33818 | B: 0.42638 | C: 0.40965 ** [JOINT LOSS] ** : 1.071349 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002416 | Grad Max: 0.059987 -> Layer: shared_layers.0.bias | Grad Mean: 0.127263 | Grad Max: 0.740647 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.008958 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000095 | Grad Max: 0.000095 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001020 | Grad Max: 0.081629 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019382 | Grad Max: 0.463103 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.006045 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008991 | Grad Max: 0.031007 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000572 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002098 | Grad Max: 0.004899 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000313 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000787 | Grad Max: 0.002281 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001785 | Grad Max: 0.003977 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027237 | Grad Max: 0.027237 [GRADIENT NORM TOTAL] 2.6094 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.268 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5092207 0.4907793] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 159/1697 | C: 249/1799 [LOSS Ex1] A: 0.68153 | B: 0.68344 | C: 0.67904 [LOGITS Ex2 A] Mean Abs: 1.097 | Max: 5.145 [LOSS Ex2] A: 0.34634 | B: 0.42282 | C: 0.38381 ** [JOINT LOSS] ** : 1.065659 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad 
Mean: 0.003504 | Grad Max: 0.088789 -> Layer: shared_layers.0.bias | Grad Mean: 0.124085 | Grad Max: 0.720976 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.010128 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018072 | Grad Max: 0.018072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001097 | Grad Max: 0.053525 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020447 | Grad Max: 0.295581 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.006112 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009235 | Grad Max: 0.032819 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000606 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002174 | Grad Max: 0.005052 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000800 | Grad Max: 0.002313 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002096 | Grad Max: 0.004965 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027565 | Grad Max: 0.027565 [GRADIENT NORM TOTAL] 2.5361 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.257 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50149214 0.4985078 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 158/1890 | C: 230/1818 [LOSS Ex1] A: 0.68120 | B: 0.68308 | C: 0.67914 [LOGITS Ex2 A] Mean Abs: 1.081 | Max: 5.476 [LOSS Ex2] A: 0.34929 | B: 0.46600 | C: 0.42607 ** [JOINT LOSS] ** : 1.094927 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004754 | Grad Max: 0.104565 -> Layer: shared_layers.0.bias | Grad Mean: 0.202685 | Grad Max: 1.161020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.010375 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015991 | Grad Max: 0.015991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.101170 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032033 | 
Grad Max: 0.559006 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.010159 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014440 | Grad Max: 0.051688 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001023 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003432 | Grad Max: 0.008354 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000524 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001291 | Grad Max: 0.003547 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003360 | Grad Max: 0.006942 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045876 | Grad Max: 0.045876 [GRADIENT NORM TOTAL] 4.0878 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.263 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50723255 0.49276745] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 6/2042 | B: 170/1878 | C: 228/1820 [LOSS Ex1] A: 0.68032 | B: 0.68340 | C: 0.67909 [LOGITS Ex2 A] Mean Abs: 1.082 | Max: 4.687 [LOSS Ex2] A: 0.34881 | B: 0.43449 | C: 0.41455 ** [JOINT LOSS] ** : 1.080215 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.055829 -> Layer: shared_layers.0.bias | Grad Mean: 0.074723 | Grad Max: 0.483982 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.009645 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008926 | Grad Max: 0.008926 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000672 | Grad Max: 0.048698 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011689 | Grad Max: 0.267615 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.004006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005018 | Grad Max: 0.019550 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000391 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001172 | Grad Max: 0.003126 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 
0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000435 | Grad Max: 0.001231 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001112 | Grad Max: 0.003244 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015744 | Grad Max: 0.015744 [GRADIENT NORM TOTAL] 1.5693 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.229 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50913477 0.49086523] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 154/1894 | C: 228/1820 [LOSS Ex1] A: 0.68231 | B: 0.68145 | C: 0.67834 [LOGITS Ex2 A] Mean Abs: 1.101 | Max: 4.554 [LOSS Ex2] A: 0.34866 | B: 0.43836 | C: 0.40435 ** [JOINT LOSS] ** : 1.077823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004630 | Grad Max: 0.116870 -> Layer: shared_layers.0.bias | Grad Mean: 0.205758 | Grad Max: 1.181603 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.008241 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001311 | Grad Max: 0.001311 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001696 | Grad Max: 0.093407 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032136 | Grad Max: 0.522941 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.010538 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014777 | Grad Max: 0.053937 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000918 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003453 | Grad Max: 0.007942 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000535 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001286 | Grad Max: 0.003661 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003147 | Grad Max: 0.006693 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044823 | Grad Max: 0.044823 [GRADIENT NORM TOTAL] 4.1581 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 
[LOGITS Ex1 A] Mean Abs: 0.034 | Max: 0.162 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084509 0.49154907] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 5/2043 | B: 159/1697 | C: 236/1812 [LOSS Ex1] A: 0.68190 | B: 0.68336 | C: 0.67997 [LOGITS Ex2 A] Mean Abs: 1.104 | Max: 4.542 [LOSS Ex2] A: 0.36188 | B: 0.43899 | C: 0.42175 ** [JOINT LOSS] ** : 1.089285 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005803 | Grad Max: 0.134861 -> Layer: shared_layers.0.bias | Grad Mean: 0.270687 | Grad Max: 1.551564 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001755 | Grad Max: 0.008655 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005047 | Grad Max: 0.005047 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.114568 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041495 | Grad Max: 0.643473 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000546 | Grad Max: 0.012120 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019141 | Grad Max: 0.064964 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001297 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004485 | Grad Max: 0.010512 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000646 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001669 | Grad Max: 0.004399 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004182 | Grad Max: 0.008199 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058227 | Grad Max: 0.058227 [GRADIENT NORM TOTAL] 5.3684 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.240 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5374016 0.46259835] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 4/2044 | B: 159/1889 | C: 209/1839 [LOSS Ex1] A: 0.68127 | B: 0.68300 | C: 0.67999 [LOGITS Ex2 A] Mean Abs: 1.118 | Max: 4.854 [LOSS Ex2] A: 0.34661 | B: 0.45333 | C: 0.42443 ** [JOINT 
LOSS] ** : 1.089543 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003652 | Grad Max: 0.099446 -> Layer: shared_layers.0.bias | Grad Mean: 0.160824 | Grad Max: 0.976620 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.010062 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018914 | Grad Max: 0.018914 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001342 | Grad Max: 0.071363 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025257 | Grad Max: 0.396722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000336 | Grad Max: 0.007776 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011813 | Grad Max: 0.040404 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000770 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002778 | Grad Max: 0.006808 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000394 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001030 | Grad Max: 0.002741 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002474 | Grad Max: 0.004747 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035247 | Grad Max: 0.035247 [GRADIENT NORM TOTAL] 3.3180 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.270 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052959 0.4947041] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 171/1877 | C: 165/1211 [LOSS Ex1] A: 0.68283 | B: 0.68332 | C: 0.67916 [LOGITS Ex2 A] Mean Abs: 1.088 | Max: 4.635 [LOSS Ex2] A: 0.34082 | B: 0.43442 | C: 0.42400 ** [JOINT LOSS] ** : 1.081518 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005606 | Grad Max: 0.159080 -> Layer: shared_layers.0.bias | Grad Mean: 0.142657 | Grad Max: 0.736171 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.009620 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017887 | Grad Max: 0.017887 -> Layer: exit2_layers.0.weight | Grad Mean: 
0.001273 | Grad Max: 0.063002 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022689 | Grad Max: 0.329573 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.007124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009630 | Grad Max: 0.035300 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000780 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002320 | Grad Max: 0.005474 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000366 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002322 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002252 | Grad Max: 0.004933 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029398 | Grad Max: 0.029398 [GRADIENT NORM TOTAL] 2.8291 [EPOCH SUMMARY] Train Loss: 1.0817 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0599 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 24/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.105 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53719544 0.4628045 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 3/2045 | B: 156/1892 | C: 229/1819 [LOSS Ex1] A: 0.68175 | B: 0.68136 | C: 0.67941 [LOGITS Ex2 A] Mean Abs: 1.084 | Max: 4.936 [LOSS Ex2] A: 0.36269 | B: 0.42657 | C: 0.41507 ** [JOINT LOSS] ** : 1.082281 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005779 | Grad Max: 0.156171 -> Layer: shared_layers.0.bias | Grad Mean: 0.235898 | Grad Max: 1.361046 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.008961 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005831 | Grad Max: 0.005831 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002016 | Grad Max: 0.111760 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037954 | Grad Max: 0.631322 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000497 | Grad Max: 0.011450 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017344 | Grad Max: 0.060775 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001154 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004106 | Grad Max: 0.009510 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000599 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001530 | Grad Max: 0.004175 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004014 | Grad Max: 0.007576 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054429 | Grad Max: 0.054429 [GRADIENT NORM TOTAL] 4.7675 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.148 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5206508 0.47934923] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/1614 | B: 160/1696 | C: 223/1825 [LOSS Ex1] A: 0.68098 | B: 0.68329 | C: 0.68014 [LOGITS Ex2 A] Mean Abs: 1.104 | Max: 4.838 [LOSS Ex2] A: 0.34186 | B: 0.42207 | C: 0.40549 ** [JOINT LOSS] ** : 1.071272 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003545 | Grad Max: 0.088657 -> Layer: shared_layers.0.bias | Grad Mean: 0.143906 | Grad Max: 0.787755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.009759 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009751 | Grad Max: 0.009751 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001177 | Grad Max: 0.091146 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021577 | Grad Max: 0.498778 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000281 | Grad Max: 0.007044 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009806 | Grad Max: 0.037234 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000822 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002332 | Grad Max: 0.006447 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000319 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002217 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002254 | Grad Max: 0.005331 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030139 | Grad Max: 0.030139 [GRADIENT NORM TOTAL] 2.8856 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.271 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50915897 0.49084103] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 2/2046 | B: 160/1888 | C: 242/1806 [LOSS Ex1] A: 0.68134 | B: 0.68292 | C: 0.67935 [LOGITS Ex2 A] Mean Abs: 1.131 | Max: 6.331 [LOSS Ex2] A: 0.34749 | B: 0.45213 | C: 0.40974 ** [JOINT LOSS] ** : 1.084322 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.075436 -> Layer: shared_layers.0.bias | Grad Mean: 0.125710 | Grad Max: 0.759782 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001874 | Grad Max: 0.009211 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008991 | Grad Max: 0.008991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001075 | Grad Max: 0.081204 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019848 | Grad Max: 0.453262 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007646 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008755 | Grad Max: 0.040218 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000822 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002061 | Grad Max: 0.006558 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000358 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000766 | Grad Max: 0.002317 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001890 | Grad Max: 0.004055 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026532 | Grad Max: 0.026532 [GRADIENT NORM TOTAL] 2.5679 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.036 | Max: 0.260 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50131977 0.49868017] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 2/2046 | B: 172/1876 | C: 246/1802 [LOSS Ex1] A: 0.68101 | B: 0.68324 | C: 0.67880 [LOGITS Ex2 A] Mean Abs: 1.141 | Max: 4.606 [LOSS Ex2] A: 0.35623 | B: 0.44654 | C: 0.40406 ** [JOINT LOSS] ** : 1.083287 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005331 | Grad Max: 0.138473 -> Layer: shared_layers.0.bias | Grad Mean: 0.234160 | Grad Max: 1.371714 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.010696 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019404 | Grad Max: 0.019404 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.113130 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036972 | Grad Max: 0.610887 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.012103 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016765 | Grad Max: 0.061467 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.001060 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003927 | Grad Max: 0.009582 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000537 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001461 | Grad Max: 0.003896 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003589 | Grad Max: 0.007127 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050584 | Grad Max: 0.050584 [GRADIENT NORM TOTAL] 4.7647 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070989 0.49290103] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 7/2041 | B: 157/1891 | C: 239/1809 [LOSS Ex1] A: 0.68012 | B: 0.68128 | C: 0.67887 [LOGITS Ex2 A] Mean Abs: 1.121 | Max: 5.463 [LOSS Ex2] A: 0.34508 | B: 0.42127 | C: 0.41714 ** [JOINT LOSS] ** : 1.074588 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004324 | Grad Max: 0.124000 -> Layer: shared_layers.0.bias | Grad Mean: 0.110084 | Grad Max: 0.600424 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.009860 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008881 | Grad Max: 0.008881 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001014 | Grad Max: 0.051194 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018416 | Grad Max: 0.255523 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.006045 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007798 | Grad Max: 0.032367 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000542 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001858 | Grad Max: 0.004671 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000318 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002020 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001725 | Grad Max: 0.004311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024042 | Grad Max: 0.024042 [GRADIENT NORM TOTAL] 2.2659 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.231 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090434 0.4909566] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 4/2044 | B: 160/1696 | C: 220/1828 [LOSS Ex1] A: 0.68215 | B: 0.68321 | C: 0.68025 [LOGITS Ex2 A] Mean Abs: 1.065 | Max: 4.595 [LOSS Ex2] A: 0.33837 | B: 0.42728 | C: 0.42476 ** [JOINT LOSS] ** : 1.078676 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003368 | Grad Max: 0.095551 -> Layer: shared_layers.0.bias | Grad Mean: 0.195502 | Grad Max: 1.123464 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001793 | Grad Max: 0.008783 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010798 | Grad Max: 0.010798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001609 | Grad Max: 
0.093884 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030371 | Grad Max: 0.542695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.009561 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014124 | Grad Max: 0.048398 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000877 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003310 | Grad Max: 0.007623 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000448 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001230 | Grad Max: 0.003221 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003149 | Grad Max: 0.006619 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043132 | Grad Max: 0.043132 [GRADIENT NORM TOTAL] 4.0290 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.163 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50874764 0.49125236] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 7/2041 | B: 160/1888 | C: 264/1784 [LOSS Ex1] A: 0.68174 | B: 0.68284 | C: 0.67739 [LOGITS Ex2 A] Mean Abs: 1.034 | Max: 5.234 [LOSS Ex2] A: 0.36667 | B: 0.46613 | C: 0.42197 ** [JOINT LOSS] ** : 1.098917 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005445 | Grad Max: 0.142066 -> Layer: shared_layers.0.bias | Grad Mean: 0.267396 | Grad Max: 1.584014 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001853 | Grad Max: 0.008849 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004828 | Grad Max: 0.004828 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.118913 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042048 | Grad Max: 0.664049 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.013448 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019375 | Grad Max: 0.068838 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001287 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004560 | Grad Max: 0.011192 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000635 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001707 | Grad Max: 0.004382 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004362 | Grad Max: 0.008337 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060548 | Grad Max: 0.060548 [GRADIENT NORM TOTAL] 5.4516 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.242 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5382747 0.4617253] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 6/2042 | B: 174/1874 | C: 251/1797 [LOSS Ex1] A: 0.68107 | B: 0.68317 | C: 0.67726 [LOGITS Ex2 A] Mean Abs: 1.085 | Max: 4.626 [LOSS Ex2] A: 0.32557 | B: 0.43986 | C: 0.40084 ** [JOINT LOSS] ** : 1.069260 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003773 | Grad Max: 0.095078 -> Layer: shared_layers.0.bias | Grad Mean: 0.164695 | Grad Max: 0.964963 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001988 | Grad Max: 0.009767 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013171 | Grad Max: 0.013171 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.075752 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026155 | Grad Max: 0.399335 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.009505 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012242 | Grad Max: 0.046500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000887 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002858 | Grad Max: 0.007058 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000405 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001056 | Grad Max: 0.002850 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002738 | Grad Max: 0.005976 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036665 | Grad Max: 0.036665 [GRADIENT NORM TOTAL] 3.3658 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.273 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5051664 0.4948336] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 3/2045 | B: 157/1891 | C: 234/1814 [LOSS Ex1] A: 0.68268 | B: 0.68120 | C: 0.67909 [LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.609 [LOSS Ex2] A: 0.33773 | B: 0.42479 | C: 0.39714 ** [JOINT LOSS] ** : 1.067543 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.062917 -> Layer: shared_layers.0.bias | Grad Mean: 0.148141 | Grad Max: 0.843071 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001852 | Grad Max: 0.008602 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010285 | Grad Max: 0.010285 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001115 | Grad Max: 0.080057 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021397 | Grad Max: 0.464312 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.007925 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010103 | Grad Max: 0.038237 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000678 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002352 | Grad Max: 0.005542 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000367 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002380 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002015 | Grad Max: 0.004701 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030295 | Grad Max: 0.030295 [GRADIENT NORM TOTAL] 3.0180 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.107 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53793776 0.4620622 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 4/2044 | B: 160/1696 | C: 225/1823 [LOSS Ex1] A: 0.68158 | B: 0.68314 | C: 0.67868 [LOGITS Ex2 A] Mean Abs: 1.138 | Max: 4.759 
[LOSS Ex2] A: 0.34235 | B: 0.43590 | C: 0.40295 ** [JOINT LOSS] ** : 1.074864 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004622 | Grad Max: 0.105975 -> Layer: shared_layers.0.bias | Grad Mean: 0.216424 | Grad Max: 1.275622 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001839 | Grad Max: 0.009111 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008047 | Grad Max: 0.008047 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001746 | Grad Max: 0.101602 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033165 | Grad Max: 0.569627 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.010862 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015194 | Grad Max: 0.054520 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001021 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003555 | Grad Max: 0.008583 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000503 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001323 | Grad Max: 0.003653 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003260 | Grad Max: 0.006586 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046741 | Grad Max: 0.046741 [GRADIENT NORM TOTAL] 4.3856 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.150 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.521187 0.478813] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/1613 | B: 161/1887 | C: 232/1816 [LOSS Ex1] A: 0.68080 | B: 0.68277 | C: 0.67960 [LOGITS Ex2 A] Mean Abs: 1.164 | Max: 5.040 [LOSS Ex2] A: 0.32899 | B: 0.44851 | C: 0.39416 ** [JOINT LOSS] ** : 1.071608 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002371 | Grad Max: 0.048704 -> Layer: shared_layers.0.bias | Grad Mean: 0.106709 | Grad Max: 0.642281 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.009273 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007015 | Grad Max: 
0.007015 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000863 | Grad Max: 0.044347 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016138 | Grad Max: 0.250014 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000210 | Grad Max: 0.005371 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007436 | Grad Max: 0.027384 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000607 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001726 | Grad Max: 0.004476 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000281 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000645 | Grad Max: 0.001766 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001568 | Grad Max: 0.003841 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023117 | Grad Max: 0.023117 [GRADIENT NORM TOTAL] 2.1283 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.273 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090752 0.49092472] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 2/2046 | B: 175/1873 | C: 249/1799 [LOSS Ex1] A: 0.68116 | B: 0.68309 | C: 0.67758 [LOGITS Ex2 A] Mean Abs: 1.107 | Max: 5.925 [LOSS Ex2] A: 0.33903 | B: 0.43310 | C: 0.39019 ** [JOINT LOSS] ** : 1.068052 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003802 | Grad Max: 0.086686 -> Layer: shared_layers.0.bias | Grad Mean: 0.142097 | Grad Max: 0.814293 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.009382 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010236 | Grad Max: 0.010236 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001198 | Grad Max: 0.062082 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022530 | Grad Max: 0.334883 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.008182 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010266 | Grad Max: 0.037962 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000763 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.002411 | Grad Max: 0.006010 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000390 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000886 | Grad Max: 0.002501 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002340 | Grad Max: 0.005571 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030911 | Grad Max: 0.030911 [GRADIENT NORM TOTAL] 2.8513 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.262 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50117713 0.49882287] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 2/2046 | B: 157/1891 | C: 246/1802 [LOSS Ex1] A: 0.68080 | B: 0.68112 | C: 0.67795 [LOGITS Ex2 A] Mean Abs: 1.089 | Max: 4.684 [LOSS Ex2] A: 0.32890 | B: 0.42163 | C: 0.41786 ** [JOINT LOSS] ** : 1.069418 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003918 | Grad Max: 0.098783 -> Layer: shared_layers.0.bias | Grad Mean: 0.196867 | Grad Max: 1.109958 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.010111 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011786 | Grad Max: 0.011786 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001565 | Grad Max: 0.090082 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029422 | Grad Max: 0.503554 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.010091 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013748 | Grad Max: 0.050058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000907 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003231 | Grad Max: 0.007859 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000510 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001198 | Grad Max: 0.003182 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003081 | Grad Max: 0.006310 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042254 | Grad Max: 0.042254 [GRADIENT NORM 
TOTAL] 3.9163 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.268 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50697035 0.49302968] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 7/2041 | B: 160/1696 | C: 162/1214 [LOSS Ex1] A: 0.67992 | B: 0.68306 | C: 0.67894 [LOGITS Ex2 A] Mean Abs: 1.099 | Max: 4.947 [LOSS Ex2] A: 0.34784 | B: 0.41105 | C: 0.40641 ** [JOINT LOSS] ** : 1.069074 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.057548 -> Layer: shared_layers.0.bias | Grad Mean: 0.076734 | Grad Max: 0.405680 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009995 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010167 | Grad Max: 0.010167 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000644 | Grad Max: 0.037646 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010847 | Grad Max: 0.208135 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004111 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004279 | Grad Max: 0.017468 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000377 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000973 | Grad Max: 0.002720 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000182 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000359 | Grad Max: 0.001122 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000930 | Grad Max: 0.003295 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012887 | Grad Max: 0.012887 [GRADIENT NORM TOTAL] 1.5220 [EPOCH SUMMARY] Train Loss: 1.0759 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0547 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0572 -> New: 1.0547) ############################## EPOCH 25/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.233 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089481 0.4910519] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 4/2044 | B: 161/1887 | C: 229/1819 [LOSS Ex1] A: 0.68199 | B: 0.68268 | C: 0.68025 [LOGITS Ex2 A] Mean Abs: 1.128 | Max: 4.698 [LOSS Ex2] A: 0.35326 | B: 0.45296 | C: 0.41808 ** [JOINT LOSS] ** : 1.089742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004843 | Grad Max: 0.108671 -> Layer: shared_layers.0.bias | Grad Mean: 0.230304 | Grad Max: 1.293460 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001813 | Grad Max: 0.008688 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010947 | Grad Max: 0.010947 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001801 | Grad Max: 0.093676 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034280 | Grad Max: 0.519659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000449 | Grad Max: 0.011896 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015904 | Grad Max: 0.062022 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000965 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003707 | Grad Max: 0.008567 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000508 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001368 | Grad Max: 0.003656 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003271 | Grad Max: 0.006419 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047072 | Grad Max: 0.047072 [GRADIENT NORM TOTAL] 4.5141 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.035 | Max: 0.164 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090251 0.4909749] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 
0.513 | Std: 0.011 [MASKS] A(Pass/Fail): 7/2041 | B: 175/1873 | C: 240/1808 [LOSS Ex1] A: 0.68158 | B: 0.68301 | C: 0.67904 [LOGITS Ex2 A] Mean Abs: 1.110 | Max: 4.626 [LOSS Ex2] A: 0.35067 | B: 0.44416 | C: 0.41265 ** [JOINT LOSS] ** : 1.083704 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005661 | Grad Max: 0.142322 -> Layer: shared_layers.0.bias | Grad Mean: 0.278661 | Grad Max: 1.562868 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.008975 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006337 | Grad Max: 0.006337 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.125159 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042120 | Grad Max: 0.698345 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.013763 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019571 | Grad Max: 0.071753 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001243 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004580 | Grad Max: 0.011031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000637 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001695 | Grad Max: 0.004275 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004195 | Grad Max: 0.007837 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058820 | Grad Max: 0.058820 [GRADIENT NORM TOTAL] 5.5238 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.244 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5391501 0.46084985] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 6/2042 | B: 157/1891 | C: 231/1817 [LOSS Ex1] A: 0.68087 | B: 0.68103 | C: 0.67939 [LOGITS Ex2 A] Mean Abs: 1.126 | Max: 4.938 [LOSS Ex2] A: 0.33647 | B: 0.42158 | C: 0.42382 ** [JOINT LOSS] ** : 1.074385 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003219 | Grad Max: 0.077269 -> Layer: shared_layers.0.bias | Grad Mean: 
0.147166 | Grad Max: 0.847878 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.010012 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015489 | Grad Max: 0.015489 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001183 | Grad Max: 0.064464 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022367 | Grad Max: 0.367199 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.006500 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010389 | Grad Max: 0.037420 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000747 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002416 | Grad Max: 0.006029 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000895 | Grad Max: 0.002371 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002098 | Grad Max: 0.004960 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030877 | Grad Max: 0.030877 [GRADIENT NORM TOTAL] 2.9663 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.275 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50507116 0.49492887] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/2045 | B: 161/1695 | C: 213/1835 [LOSS Ex1] A: 0.68250 | B: 0.68297 | C: 0.68031 [LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.679 [LOSS Ex2] A: 0.33248 | B: 0.42202 | C: 0.43180 ** [JOINT LOSS] ** : 1.077358 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006064 | Grad Max: 0.169343 -> Layer: shared_layers.0.bias | Grad Mean: 0.222752 | Grad Max: 1.245109 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.008382 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007349 | Grad Max: 0.007349 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.108808 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035697 | Grad Max: 0.617060 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad 
Max: 0.011800 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016006 | Grad Max: 0.061284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001170 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003769 | Grad Max: 0.009130 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000551 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001388 | Grad Max: 0.003699 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003591 | Grad Max: 0.006955 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048568 | Grad Max: 0.048568 [GRADIENT NORM TOTAL] 4.4921 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.109 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5387489 0.4612511] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 4/2044 | B: 162/1886 | C: 257/1791 [LOSS Ex1] A: 0.68139 | B: 0.68260 | C: 0.67743 [LOGITS Ex2 A] Mean Abs: 1.090 | Max: 4.787 [LOSS Ex2] A: 0.35660 | B: 0.45194 | C: 0.43048 ** [JOINT LOSS] ** : 1.093479 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007445 | Grad Max: 0.193810 -> Layer: shared_layers.0.bias | Grad Mean: 0.307302 | Grad Max: 1.745826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001868 | Grad Max: 0.008796 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001639 | Grad Max: 0.001639 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002630 | Grad Max: 0.205326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049626 | Grad Max: 1.151001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000642 | Grad Max: 0.016266 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022569 | Grad Max: 0.081059 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005293 | Grad Max: 0.012205 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000717 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001964 | Grad Max: 0.005143 -> 
Layer: exit2_layers.12.weight | Grad Mean: 0.005063 | Grad Max: 0.009782 -> Layer: exit2_layers.12.bias | Grad Mean: 0.069296 | Grad Max: 0.069296 [GRADIENT NORM TOTAL] 6.3141 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.152 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52175844 0.47824153] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/1613 | B: 177/1871 | C: 244/1804 [LOSS Ex1] A: 0.68060 | B: 0.68293 | C: 0.67705 [LOGITS Ex2 A] Mean Abs: 1.129 | Max: 4.950 [LOSS Ex2] A: 0.33564 | B: 0.44042 | C: 0.40237 ** [JOINT LOSS] ** : 1.073005 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005441 | Grad Max: 0.134084 -> Layer: shared_layers.0.bias | Grad Mean: 0.185278 | Grad Max: 1.068156 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.009737 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009575 | Grad Max: 0.009575 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001634 | Grad Max: 0.128289 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030397 | Grad Max: 0.683243 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000391 | Grad Max: 0.010064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013663 | Grad Max: 0.050444 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000876 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003211 | Grad Max: 0.007293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000517 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001186 | Grad Max: 0.003270 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003212 | Grad Max: 0.006562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042219 | Grad Max: 0.042219 [GRADIENT NORM TOTAL] 3.8085 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.276 [SAMPLE 0 PREDICTION A] Top2 Probs: 
[0.50900453 0.49099547] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/2045 | B: 161/1887 | C: 247/1801 [LOSS Ex1] A: 0.68096 | B: 0.68094 | C: 0.67762 [LOGITS Ex2 A] Mean Abs: 1.166 | Max: 4.441 [LOSS Ex2] A: 0.33323 | B: 0.42764 | C: 0.39570 ** [JOINT LOSS] ** : 1.065363 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002594 | Grad Max: 0.054757 -> Layer: shared_layers.0.bias | Grad Mean: 0.141451 | Grad Max: 0.822219 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.008985 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005475 | Grad Max: 0.005475 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001142 | Grad Max: 0.088225 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021273 | Grad Max: 0.498815 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.006333 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009781 | Grad Max: 0.035736 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000621 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002288 | Grad Max: 0.005705 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000335 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000851 | Grad Max: 0.002294 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001991 | Grad Max: 0.004639 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029145 | Grad Max: 0.029145 [GRADIENT NORM TOTAL] 2.9138 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.264 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010153 0.4989847] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 2/2046 | B: 161/1695 | C: 238/1810 [LOSS Ex1] A: 0.68059 | B: 0.68290 | C: 0.67958 [LOGITS Ex2 A] Mean Abs: 1.163 | Max: 4.388 [LOSS Ex2] A: 0.34484 | B: 0.42515 | C: 0.40298 ** [JOINT LOSS] ** : 1.072014 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 
0.004691 | Grad Max: 0.109689 -> Layer: shared_layers.0.bias | Grad Mean: 0.233186 | Grad Max: 1.348753 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.010687 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022287 | Grad Max: 0.022287 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.105074 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036115 | Grad Max: 0.595891 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.011207 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016920 | Grad Max: 0.057218 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001038 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003937 | Grad Max: 0.009340 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000573 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001460 | Grad Max: 0.003957 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003597 | Grad Max: 0.007211 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051129 | Grad Max: 0.051129 [GRADIENT NORM TOTAL] 4.7276 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.270 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068286 0.49317142] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 8/2040 | B: 162/1886 | C: 273/1775 [LOSS Ex1] A: 0.67971 | B: 0.68252 | C: 0.67629 [LOGITS Ex2 A] Mean Abs: 1.129 | Max: 5.141 [LOSS Ex2] A: 0.34137 | B: 0.44492 | C: 0.39036 ** [JOINT LOSS] ** : 1.071722 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003104 | Grad Max: 0.092938 -> Layer: shared_layers.0.bias | Grad Mean: 0.096159 | Grad Max: 0.508472 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.010244 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009877 | Grad Max: 0.009877 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000877 | Grad Max: 0.051578 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015950 | Grad 
Max: 0.296624 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000203 | Grad Max: 0.005048 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006989 | Grad Max: 0.024248 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000545 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001643 | Grad Max: 0.004315 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000242 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001686 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001414 | Grad Max: 0.003459 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020390 | Grad Max: 0.020390 [GRADIENT NORM TOTAL] 1.9990 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.235 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50885624 0.49114373] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 4/2044 | B: 179/1869 | C: 217/1831 [LOSS Ex1] A: 0.68183 | B: 0.68285 | C: 0.68013 [LOGITS Ex2 A] Mean Abs: 1.093 | Max: 4.553 [LOSS Ex2] A: 0.33116 | B: 0.43989 | C: 0.41554 ** [JOINT LOSS] ** : 1.077131 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004793 | Grad Max: 0.112597 -> Layer: shared_layers.0.bias | Grad Mean: 0.205301 | Grad Max: 1.161624 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001753 | Grad Max: 0.008372 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008306 | Grad Max: 0.008306 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001723 | Grad Max: 0.099157 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032403 | Grad Max: 0.550420 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.010334 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014826 | Grad Max: 0.053305 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000930 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003482 | Grad Max: 0.007949 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000483 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.001284 | Grad Max: 0.003413 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003329 | Grad Max: 0.006582 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045001 | Grad Max: 0.045001 [GRADIENT NORM TOTAL] 4.1859 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093225 0.49067745] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 7/2041 | B: 164/1884 | C: 251/1797 [LOSS Ex1] A: 0.68141 | B: 0.68085 | C: 0.67695 [LOGITS Ex2 A] Mean Abs: 1.053 | Max: 5.045 [LOSS Ex2] A: 0.34342 | B: 0.42177 | C: 0.41223 ** [JOINT LOSS] ** : 1.072212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006219 | Grad Max: 0.158590 -> Layer: shared_layers.0.bias | Grad Mean: 0.286456 | Grad Max: 1.626035 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.008696 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003383 | Grad Max: 0.003383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.123388 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044711 | Grad Max: 0.696445 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.014720 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020470 | Grad Max: 0.076562 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001424 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004803 | Grad Max: 0.011289 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.000623 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001768 | Grad Max: 0.004578 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004434 | Grad Max: 0.008635 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061091 | Grad Max: 0.061091 [GRADIENT NORM TOTAL] 5.7551 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.037 | Max: 0.246 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54003197 0.45996803] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 6/2042 | B: 162/1694 | C: 238/1810 [LOSS Ex1] A: 0.68067 | B: 0.68282 | C: 0.67904 [LOGITS Ex2 A] Mean Abs: 1.108 | Max: 4.748 [LOSS Ex2] A: 0.33166 | B: 0.41671 | C: 0.41229 ** [JOINT LOSS] ** : 1.067725 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003435 | Grad Max: 0.093360 -> Layer: shared_layers.0.bias | Grad Mean: 0.156627 | Grad Max: 0.853228 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.009988 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017613 | Grad Max: 0.017613 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001298 | Grad Max: 0.076608 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024074 | Grad Max: 0.419716 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.007940 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011088 | Grad Max: 0.041023 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000831 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002600 | Grad Max: 0.006928 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000950 | Grad Max: 0.002446 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002325 | Grad Max: 0.004844 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031778 | Grad Max: 0.031778 [GRADIENT NORM TOTAL] 3.1529 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.278 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504959 0.495041] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/2045 | B: 166/1882 | C: 234/1814 [LOSS Ex1] A: 0.68234 | B: 0.68244 | C: 0.67786 [LOGITS Ex2 A] Mean Abs: 1.145 | Max: 4.664 [LOSS Ex2] A: 0.33370 | B: 0.45170 | C: 0.40808 ** [JOINT LOSS] ** : 
1.078710 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002645 | Grad Max: 0.059407 -> Layer: shared_layers.0.bias | Grad Mean: 0.154051 | Grad Max: 0.878194 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.008717 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010361 | Grad Max: 0.010361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001193 | Grad Max: 0.060841 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022955 | Grad Max: 0.327884 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.006861 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010970 | Grad Max: 0.038012 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000722 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002528 | Grad Max: 0.006021 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000323 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000924 | Grad Max: 0.002381 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002085 | Grad Max: 0.004199 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031054 | Grad Max: 0.031054 [GRADIENT NORM TOTAL] 3.0497 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.111 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5394925 0.46050745] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 4/2044 | B: 181/1867 | C: 159/1217 [LOSS Ex1] A: 0.68122 | B: 0.68277 | C: 0.67853 [LOGITS Ex2 A] Mean Abs: 1.169 | Max: 4.714 [LOSS Ex2] A: 0.35059 | B: 0.44726 | C: 0.38413 ** [JOINT LOSS] ** : 1.074835 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004057 | Grad Max: 0.087677 -> Layer: shared_layers.0.bias | Grad Mean: 0.215651 | Grad Max: 1.206745 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.007807 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003990 | Grad Max: 0.003990 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001694 | Grad 
Max: 0.109191 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032105 | Grad Max: 0.622403 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.008826 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014447 | Grad Max: 0.048421 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000881 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003355 | Grad Max: 0.007556 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000527 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001233 | Grad Max: 0.003635 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002915 | Grad Max: 0.006144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042071 | Grad Max: 0.042071 [GRADIENT NORM TOTAL] 4.3702 [EPOCH SUMMARY] Train Loss: 1.0765 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0466 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0547 -> New: 1.0466) ############################## EPOCH 26/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.154 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5222757 0.47772428] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/1613 | B: 168/1880 | C: 232/1816 [LOSS Ex1] A: 0.68043 | B: 0.68077 | C: 0.67942 [LOGITS Ex2 A] Mean Abs: 1.179 | Max: 4.736 [LOSS Ex2] A: 0.31987 | B: 0.42206 | C: 0.41540 ** [JOINT LOSS] ** : 1.065983 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002794 | Grad Max: 0.069399 -> Layer: shared_layers.0.bias | Grad Mean: 0.147425 | Grad Max: 0.823084 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.009595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010339 | Grad Max: 0.010339 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001128 | Grad Max: 0.063463 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021574 | Grad Max: 0.361216 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.006104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009689 | Grad Max: 0.032068 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000611 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002250 | Grad Max: 0.005465 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000351 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000827 | Grad Max: 0.002445 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001874 | Grad Max: 0.004266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028363 | Grad Max: 0.028363 [GRADIENT NORM TOTAL] 2.9036 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.279 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089412 0.49105883] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 3/2045 | B: 163/1693 | C: 223/1825 [LOSS Ex1] A: 0.68078 | B: 0.68274 | C: 0.67913 [LOGITS Ex2 A] Mean Abs: 1.127 | Max: 4.959 [LOSS Ex2] A: 0.33406 | B: 0.42404 | C: 0.40502 ** [JOINT LOSS] ** : 1.068590 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003014 | Grad Max: 0.074776 -> Layer: shared_layers.0.bias | Grad Mean: 0.141641 | Grad Max: 0.781797 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.009421 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012508 | Grad Max: 0.012508 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001125 | Grad Max: 0.070009 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020960 | Grad Max: 0.395921 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000279 | Grad Max: 0.007395 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009915 | Grad Max: 0.039264 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000809 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002316 | Grad Max: 0.006279 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000344 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000844 | Grad Max: 0.002218 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001999 | Grad Max: 0.004568 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028150 | Grad Max: 0.028150 [GRADIENT NORM TOTAL] 2.8408 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008837 0.49911627] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 5/2043 | B: 168/1880 | C: 258/1790 [LOSS Ex1] A: 0.68039 | B: 0.68236 | C: 0.67770 [LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.665 [LOSS Ex2] A: 0.33782 | B: 0.45862 | C: 0.38589 ** [JOINT LOSS] ** : 1.074261 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002964 | Grad Max: 0.099836 -> Layer: shared_layers.0.bias | Grad Mean: 0.207623 | Grad Max: 1.178064 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.010047 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015443 | Grad Max: 0.015443 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001602 | Grad Max: 0.104481 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030230 | Grad Max: 0.598527 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000398 | Grad Max: 0.009145 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014243 | Grad Max: 0.051371 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000951 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003319 | Grad Max: 0.008358 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000451 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001221 | Grad Max: 0.003261 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002996 | Grad Max: 0.005787 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041970 | Grad Max: 0.041970 [GRADIENT NORM TOTAL] 4.2206 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.037 | Max: 0.273 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067142 0.49328578] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 9/2039 | B: 181/1867 | C: 293/1755 [LOSS Ex1] A: 0.67951 | B: 0.68269 | C: 0.67415 [LOGITS Ex2 A] Mean Abs: 1.119 | Max: 5.659 [LOSS Ex2] A: 0.33860 | B: 0.42963 | C: 0.39251 ** [JOINT LOSS] ** : 1.065698 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.050281 -> Layer: shared_layers.0.bias | Grad Mean: 0.079348 | Grad Max: 0.454141 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.009495 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004054 | Grad Max: 0.004054 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000695 | Grad Max: 0.050350 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011855 | Grad Max: 0.280168 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.004466 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004770 | Grad Max: 0.021603 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000424 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001087 | Grad Max: 0.002955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000401 | Grad Max: 0.001210 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001092 | Grad Max: 0.002921 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014985 | Grad Max: 0.014985 [GRADIENT NORM TOTAL] 1.6347 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.237 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50877744 0.49122256] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 4/2044 | B: 168/1880 | C: 231/1817 [LOSS Ex1] A: 0.68167 | B: 0.68068 | C: 0.67787 [LOGITS Ex2 A] Mean Abs: 1.151 | Max: 5.028 [LOSS Ex2] A: 0.33775 | B: 0.42573 | C: 0.40620 ** [JOINT LOSS] ** : 1.069965 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003834 | Grad Max: 0.093984 -> Layer: shared_layers.0.bias | Grad Mean: 0.196621 | Grad Max: 1.129699 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.007691 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001742 | Grad Max: 0.001742 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001578 | Grad Max: 0.090020 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029790 | Grad Max: 0.511899 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.008889 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013790 | Grad Max: 0.048233 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000926 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003205 | Grad Max: 0.007400 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000440 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001174 | Grad Max: 0.003003 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002706 | Grad Max: 0.005721 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039634 | Grad Max: 0.039634 [GRADIENT NORM TOTAL] 3.9795 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.036 | Max: 0.167 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50959677 0.49040323] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.513 | Std: 0.012 [MASKS] A(Pass/Fail): 8/2040 | B: 163/1693 | C: 230/1818 [LOSS Ex1] A: 0.68125 | B: 0.68266 | C: 0.67860 [LOGITS Ex2 A] Mean Abs: 1.114 | Max: 4.910 [LOSS Ex2] A: 0.34906 | B: 0.42442 | C: 0.41097 ** [JOINT LOSS] ** : 1.075652 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005779 | Grad Max: 0.124677 -> Layer: shared_layers.0.bias | Grad Mean: 0.240921 | Grad Max: 1.360271 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.008999 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007298 | Grad Max: 0.007298 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 
0.114190 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037368 | Grad Max: 0.627017 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.011616 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017072 | Grad Max: 0.058067 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001168 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003975 | Grad Max: 0.010223 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000538 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001465 | Grad Max: 0.003842 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003647 | Grad Max: 0.007000 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051487 | Grad Max: 0.051487 [GRADIENT NORM TOTAL] 4.8180 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.249 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54086053 0.45913947] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 169/1879 | C: 241/1807 [LOSS Ex1] A: 0.68047 | B: 0.68228 | C: 0.67830 [LOGITS Ex2 A] Mean Abs: 1.133 | Max: 4.978 [LOSS Ex2] A: 0.32888 | B: 0.44624 | C: 0.40803 ** [JOINT LOSS] ** : 1.074732 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003039 | Grad Max: 0.082605 -> Layer: shared_layers.0.bias | Grad Mean: 0.111221 | Grad Max: 0.578430 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.009809 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016003 | Grad Max: 0.016003 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.055410 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016762 | Grad Max: 0.298375 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000220 | Grad Max: 0.006533 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007764 | Grad Max: 0.030210 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000575 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001803 | Grad Max: 0.004998 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000283 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000659 | Grad Max: 0.001855 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001501 | Grad Max: 0.003785 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022341 | Grad Max: 0.022341 [GRADIENT NORM TOTAL] 2.2171 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.281 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5048996 0.49510044] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/2045 | B: 181/1867 | C: 241/1807 [LOSS Ex1] A: 0.68217 | B: 0.68261 | C: 0.67725 [LOGITS Ex2 A] Mean Abs: 1.109 | Max: 4.642 [LOSS Ex2] A: 0.33004 | B: 0.42629 | C: 0.41351 ** [JOINT LOSS] ** : 1.070621 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006898 | Grad Max: 0.184518 -> Layer: shared_layers.0.bias | Grad Mean: 0.232190 | Grad Max: 1.348892 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001797 | Grad Max: 0.008291 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004338 | Grad Max: 0.004338 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.141033 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037615 | Grad Max: 0.736018 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.011104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016737 | Grad Max: 0.061166 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003942 | Grad Max: 0.010395 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000577 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001441 | Grad Max: 0.003773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003555 | Grad Max: 0.006836 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049491 | Grad Max: 0.049491 [GRADIENT NORM TOTAL] 4.7316 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.113 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54026365 0.45973638] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 4/2044 | B: 168/1880 | C: 228/1820 [LOSS Ex1] A: 0.68102 | B: 0.68059 | C: 0.67952 [LOGITS Ex2 A] Mean Abs: 1.115 | Max: 5.139 [LOSS Ex2] A: 0.36052 | B: 0.43536 | C: 0.42603 ** [JOINT LOSS] ** : 1.087679 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007674 | Grad Max: 0.209075 -> Layer: shared_layers.0.bias | Grad Mean: 0.316458 | Grad Max: 1.701007 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001883 | Grad Max: 0.009175 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007063 | Grad Max: 0.007063 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002564 | Grad Max: 0.139681 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048443 | Grad Max: 0.733838 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.014286 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022265 | Grad Max: 0.075175 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001430 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005204 | Grad Max: 0.011833 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.000724 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001917 | Grad Max: 0.004871 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004753 | Grad Max: 0.008933 -> Layer: exit2_layers.12.bias | Grad Mean: 0.066770 | Grad Max: 0.066770 [GRADIENT NORM TOTAL] 6.1865 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.156 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52283317 0.4771669 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 3/1613 | B: 164/1692 | C: 240/1808 [LOSS Ex1] A: 0.68023 | B: 0.68257 | C: 0.67863 [LOGITS Ex2 A] Mean Abs: 1.154 | Max: 4.757 
[LOSS Ex2] A: 0.32360 | B: 0.41532 | C: 0.41223 ** [JOINT LOSS] ** : 1.064195 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005550 | Grad Max: 0.154371 -> Layer: shared_layers.0.bias | Grad Mean: 0.223447 | Grad Max: 1.171280 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.009523 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008248 | Grad Max: 0.008248 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001841 | Grad Max: 0.100727 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034125 | Grad Max: 0.538353 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000434 | Grad Max: 0.010027 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015311 | Grad Max: 0.051062 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000962 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003604 | Grad Max: 0.008134 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000499 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001319 | Grad Max: 0.003677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003374 | Grad Max: 0.006922 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045826 | Grad Max: 0.045826 [GRADIENT NORM TOTAL] 4.4147 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.282 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50891906 0.49108094] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 3/2045 | B: 170/1878 | C: 212/1836 [LOSS Ex1] A: 0.68058 | B: 0.68219 | C: 0.68051 [LOGITS Ex2 A] Mean Abs: 1.172 | Max: 5.579 [LOSS Ex2] A: 0.34019 | B: 0.44472 | C: 0.42590 ** [JOINT LOSS] ** : 1.084698 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.044449 -> Layer: shared_layers.0.bias | Grad Mean: 0.116249 | Grad Max: 0.713034 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.009627 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015035 | Grad Max: 
0.015035 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000911 | Grad Max: 0.084594 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017137 | Grad Max: 0.465588 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.005276 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007598 | Grad Max: 0.026091 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000523 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001718 | Grad Max: 0.004148 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000622 | Grad Max: 0.001666 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.002969 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020879 | Grad Max: 0.020879 [GRADIENT NORM TOTAL] 2.4080 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.269 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007744 0.49922562] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 5/2043 | B: 182/1866 | C: 247/1801 [LOSS Ex1] A: 0.68019 | B: 0.68252 | C: 0.67869 [LOGITS Ex2 A] Mean Abs: 1.171 | Max: 4.494 [LOSS Ex2] A: 0.34103 | B: 0.44336 | C: 0.39295 ** [JOINT LOSS] ** : 1.072915 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.113411 -> Layer: shared_layers.0.bias | Grad Mean: 0.217494 | Grad Max: 1.301247 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.010451 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019571 | Grad Max: 0.019571 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001764 | Grad Max: 0.107975 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033429 | Grad Max: 0.611692 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000423 | Grad Max: 0.011086 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015087 | Grad Max: 0.056345 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000898 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.003502 | Grad Max: 0.007855 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000462 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001287 | Grad Max: 0.003497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002987 | Grad Max: 0.006420 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043772 | Grad Max: 0.043772 [GRADIENT NORM TOTAL] 4.4461 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.275 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50662386 0.4933761 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 10/2038 | B: 173/1875 | C: 252/1796 [LOSS Ex1] A: 0.67930 | B: 0.68050 | C: 0.67670 [LOGITS Ex2 A] Mean Abs: 1.155 | Max: 5.816 [LOSS Ex2] A: 0.34571 | B: 0.41956 | C: 0.39320 ** [JOINT LOSS] ** : 1.064989 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003437 | Grad Max: 0.077904 -> Layer: shared_layers.0.bias | Grad Mean: 0.093541 | Grad Max: 0.514497 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.009913 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007770 | Grad Max: 0.007770 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000855 | Grad Max: 0.053902 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015670 | Grad Max: 0.305307 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000193 | Grad Max: 0.004575 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006776 | Grad Max: 0.020363 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000524 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001592 | Grad Max: 0.004377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000253 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001682 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001333 | Grad Max: 0.003443 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019792 | Grad Max: 0.019792 [GRADIENT NORM 
TOTAL] 1.9453 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.238 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50872445 0.49127558] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 4/2044 | B: 164/1692 | C: 157/1219 [LOSS Ex1] A: 0.68151 | B: 0.68249 | C: 0.67950 [LOGITS Ex2 A] Mean Abs: 1.095 | Max: 5.129 [LOSS Ex2] A: 0.33554 | B: 0.41415 | C: 0.40115 ** [JOINT LOSS] ** : 1.064775 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004687 | Grad Max: 0.121150 -> Layer: shared_layers.0.bias | Grad Mean: 0.207749 | Grad Max: 1.165584 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001766 | Grad Max: 0.008440 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006534 | Grad Max: 0.006534 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001700 | Grad Max: 0.090499 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032173 | Grad Max: 0.509344 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.010148 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014671 | Grad Max: 0.053669 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000870 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003416 | Grad Max: 0.007992 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000464 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001240 | Grad Max: 0.003267 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002983 | Grad Max: 0.006202 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042074 | Grad Max: 0.042074 [GRADIENT NORM TOTAL] 4.2033 [EPOCH SUMMARY] Train Loss: 1.0718 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.0508 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 27/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.168 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50987583 0.4901242 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.012 [MASKS] A(Pass/Fail): 10/2038 | B: 172/1876 | C: 254/1794 [LOSS Ex1] A: 0.68109 | B: 0.68211 | C: 0.67703 [LOGITS Ex2 A] Mean Abs: 1.061 | Max: 4.979 [LOSS Ex2] A: 0.33835 | B: 0.45066 | C: 0.41134 ** [JOINT LOSS] ** : 1.080189 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005637 | Grad Max: 0.141784 -> Layer: shared_layers.0.bias | Grad Mean: 0.277865 | Grad Max: 1.576257 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.009068 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006187 | Grad Max: 0.006187 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.129354 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042604 | Grad Max: 0.717589 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000547 | Grad Max: 0.013792 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019487 | Grad Max: 0.066817 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001230 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004536 | Grad Max: 0.011031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000575 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001665 | Grad Max: 0.004123 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004113 | Grad Max: 0.007804 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057496 | Grad Max: 0.057496 [GRADIENT NORM TOTAL] 5.6219 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.251 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417202 0.45827976] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 182/1866 | C: 245/1803 [LOSS Ex1] A: 0.68027 | B: 0.68244 | C: 0.67842 [LOGITS Ex2 A] Mean Abs: 1.119 | Max: 4.913 [LOSS Ex2] A: 0.31206 | B: 0.43696 | C: 0.40551 ** [JOINT LOSS] ** : 1.065219 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002771 | Grad Max: 0.067635 -> Layer: shared_layers.0.bias | Grad Mean: 0.107865 | Grad Max: 0.640462 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.009344 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013410 | Grad Max: 0.013410 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000951 | Grad Max: 0.069632 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018121 | Grad Max: 0.385997 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.005340 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008289 | Grad Max: 0.029245 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000565 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001929 | Grad Max: 0.004814 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000299 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000707 | Grad Max: 0.002007 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001774 | Grad Max: 0.004197 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024497 | Grad Max: 0.024497 [GRADIENT NORM TOTAL] 2.2718 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.283 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047949 0.4952051] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 4/2044 | B: 173/1875 | C: 242/1806 [LOSS Ex1] A: 0.68201 | B: 0.68041 | C: 0.67855 [LOGITS Ex2 A] Mean Abs: 1.157 | Max: 4.777 [LOSS Ex2] A: 0.33942 | B: 0.42848 | C: 0.40490 ** [JOINT LOSS] ** : 1.071254 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004375 | Grad Max: 0.103616 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.200073 | Grad Max: 1.124629 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001884 | Grad Max: 0.008871 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010080 | Grad Max: 0.010080 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001624 | Grad Max: 0.092447 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030964 | Grad Max: 0.520041 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000409 | Grad Max: 0.009812 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014680 | Grad Max: 0.052220 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000884 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003390 | Grad Max: 0.007465 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000443 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001236 | Grad Max: 0.003278 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002918 | Grad Max: 0.005823 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042163 | Grad Max: 0.042163 [GRADIENT NORM TOTAL] 4.0637 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.114 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409866 0.4590134] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 164/1692 | C: 256/1792 [LOSS Ex1] A: 0.68086 | B: 0.68240 | C: 0.67662 [LOGITS Ex2 A] Mean Abs: 1.198 | Max: 5.065 [LOSS Ex2] A: 0.34927 | B: 0.42088 | C: 0.39175 ** [JOINT LOSS] ** : 1.067259 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005709 | Grad Max: 0.129943 -> Layer: shared_layers.0.bias | Grad Mean: 0.264654 | Grad Max: 1.455758 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.008819 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003805 | Grad Max: 0.003805 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.130038 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039902 | Grad Max: 0.725790 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000512 | Grad Max: 0.012422 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018313 | Grad Max: 0.063152 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004261 | Grad Max: 0.010427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000575 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001559 | Grad Max: 0.004131 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003746 | Grad Max: 0.006913 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053628 | Grad Max: 0.053628 [GRADIENT NORM TOTAL] 5.2881 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.158 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5233412 0.47665882] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 4/1612 | B: 173/1875 | C: 229/1819 [LOSS Ex1] A: 0.68006 | B: 0.68202 | C: 0.67906 [LOGITS Ex2 A] Mean Abs: 1.193 | Max: 4.902 [LOSS Ex2] A: 0.32425 | B: 0.44516 | C: 0.39063 ** [JOINT LOSS] ** : 1.067062 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004015 | Grad Max: 0.096731 -> Layer: shared_layers.0.bias | Grad Mean: 0.170559 | Grad Max: 0.961364 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.010011 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013301 | Grad Max: 0.013301 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001394 | Grad Max: 0.073601 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026693 | Grad Max: 0.409170 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000341 | Grad Max: 0.010517 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012285 | Grad Max: 0.047445 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000751 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002843 | Grad Max: 0.006685 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000380 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001044 | Grad Max: 0.002733 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002388 | Grad Max: 0.005043 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035951 | Grad Max: 0.035951 [GRADIENT NORM TOTAL] 3.4539 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.284 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088399 0.49116012] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 3/2045 | B: 184/1864 | C: 251/1797 [LOSS Ex1] A: 0.68040 | B: 0.68236 | C: 0.67716 [LOGITS Ex2 A] Mean Abs: 1.139 | Max: 5.975 [LOSS Ex2] A: 0.33513 | B: 0.43080 | C: 0.39491 ** [JOINT LOSS] ** : 1.066920 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002912 | Grad Max: 0.074218 -> Layer: shared_layers.0.bias | Grad Mean: 0.134808 | Grad Max: 0.775246 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.008946 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007361 | Grad Max: 0.007361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001092 | Grad Max: 0.076937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020505 | Grad Max: 0.432948 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.007671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009363 | Grad Max: 0.036072 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000650 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002180 | Grad Max: 0.005046 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000288 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000793 | Grad Max: 0.002162 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001964 | Grad Max: 0.004524 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027330 | Grad Max: 0.027330 [GRADIENT NORM TOTAL] 2.7620 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.038 | Max: 0.271 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50060904 0.4993909 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 175/1873 | C: 244/1804 [LOSS Ex1] A: 0.67999 | B: 0.68032 | C: 0.67805 [LOGITS Ex2 A] Mean Abs: 1.130 | Max: 4.663 [LOSS Ex2] A: 0.34104 | B: 0.42685 | C: 0.42305 ** [JOINT LOSS] ** : 1.076432 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004062 | Grad Max: 0.099994 -> Layer: shared_layers.0.bias | Grad Mean: 0.217314 | Grad Max: 1.220994 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.010039 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010607 | Grad Max: 0.010607 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.106623 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032218 | Grad Max: 0.604028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.009433 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014887 | Grad Max: 0.050978 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.001004 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003457 | Grad Max: 0.009137 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000407 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001265 | Grad Max: 0.003202 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003064 | Grad Max: 0.005562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043052 | Grad Max: 0.043052 [GRADIENT NORM TOTAL] 4.3812 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.277 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50649977 0.49350023] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 11/2037 | B: 164/1692 | C: 222/1826 [LOSS Ex1] A: 0.67910 | B: 0.68232 | C: 0.67854 [LOGITS Ex2 A] Mean Abs: 1.136 | Max: 5.058 [LOSS Ex2] A: 0.32897 | B: 0.41481 | C: 0.43137 ** [JOINT LOSS] ** : 1.071707 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001924 | Grad Max: 0.048165 -> Layer: shared_layers.0.bias | Grad Mean: 0.090937 | Grad Max: 0.533452 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009730 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008116 | Grad Max: 0.008116 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000771 | Grad Max: 0.046267 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013571 | Grad Max: 0.258440 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.005542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006166 | Grad Max: 0.025590 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001408 | Grad Max: 0.003717 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000513 | Grad Max: 0.001379 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001286 | Grad Max: 0.003369 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018212 | Grad Max: 0.018212 [GRADIENT NORM TOTAL] 1.9061 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.240 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508641 0.49135903] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 4/2044 | B: 173/1875 | C: 211/1837 [LOSS Ex1] A: 0.68135 | B: 0.68194 | C: 0.67906 [LOGITS Ex2 A] Mean Abs: 1.154 | Max: 5.050 [LOSS Ex2] A: 0.33550 | B: 0.46291 | C: 0.38133 ** [JOINT LOSS] ** : 1.074035 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004003 | Grad Max: 0.094192 -> Layer: shared_layers.0.bias | Grad Mean: 0.204169 | Grad Max: 1.140155 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001761 | Grad Max: 0.008040 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005614 | Grad Max: 0.005614 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001590 | Grad Max: 
0.096511 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030237 | Grad Max: 0.513325 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.010310 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013834 | Grad Max: 0.054223 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003195 | Grad Max: 0.007197 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000434 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001166 | Grad Max: 0.002976 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002750 | Grad Max: 0.006385 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040376 | Grad Max: 0.040376 [GRADIENT NORM TOTAL] 4.0545 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.169 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101792 0.48982075] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 11/2037 | B: 185/1863 | C: 239/1809 [LOSS Ex1] A: 0.68093 | B: 0.68228 | C: 0.67897 [LOGITS Ex2 A] Mean Abs: 1.138 | Max: 4.614 [LOSS Ex2] A: 0.33844 | B: 0.44249 | C: 0.38726 ** [JOINT LOSS] ** : 1.070127 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004404 | Grad Max: 0.108500 -> Layer: shared_layers.0.bias | Grad Mean: 0.251120 | Grad Max: 1.430617 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.009010 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008466 | Grad Max: 0.008466 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.116662 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037311 | Grad Max: 0.651976 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.011306 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017445 | Grad Max: 0.062896 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001115 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004037 | Grad Max: 0.008983 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000541 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001484 | Grad Max: 0.004060 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003441 | Grad Max: 0.007188 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050738 | Grad Max: 0.050738 [GRADIENT NORM TOTAL] 5.0619 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.253 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54257405 0.45742592] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 175/1873 | C: 231/1817 [LOSS Ex1] A: 0.68007 | B: 0.68023 | C: 0.67846 [LOGITS Ex2 A] Mean Abs: 1.142 | Max: 4.906 [LOSS Ex2] A: 0.32518 | B: 0.42297 | C: 0.40858 ** [JOINT LOSS] ** : 1.065165 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.051005 -> Layer: shared_layers.0.bias | Grad Mean: 0.131753 | Grad Max: 0.783788 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001982 | Grad Max: 0.009502 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011641 | Grad Max: 0.011641 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001030 | Grad Max: 0.071251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019390 | Grad Max: 0.405110 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000253 | Grad Max: 0.006554 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009142 | Grad Max: 0.034882 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000589 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002113 | Grad Max: 0.005066 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000776 | Grad Max: 0.002129 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001757 | Grad Max: 0.004363 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026461 | Grad Max: 0.026461 [GRADIENT NORM TOTAL] 2.7437 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.286 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50471705 0.49528295] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 5/2043 | B: 165/1691 | C: 275/1773 [LOSS Ex1] A: 0.68184 | B: 0.68224 | C: 0.67511 [LOGITS Ex2 A] Mean Abs: 1.123 | Max: 5.052 [LOSS Ex2] A: 0.33360 | B: 0.41755 | C: 0.39027 ** [JOINT LOSS] ** : 1.060204 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005997 | Grad Max: 0.167012 -> Layer: shared_layers.0.bias | Grad Mean: 0.179769 | Grad Max: 0.933191 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.008602 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007307 | Grad Max: 0.007307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001574 | Grad Max: 0.095344 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028494 | Grad Max: 0.482856 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.007773 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012490 | Grad Max: 0.041355 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000794 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002962 | Grad Max: 0.006787 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000428 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001070 | Grad Max: 0.002785 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002689 | Grad Max: 0.005348 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036136 | Grad Max: 0.036136 [GRADIENT NORM TOTAL] 3.5955 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.116 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54175127 0.45824873] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 173/1875 | C: 232/1816 [LOSS Ex1] A: 0.68068 | B: 0.68186 | C: 0.67845 [LOGITS Ex2 A] Mean Abs: 
1.117 | Max: 5.120 [LOSS Ex2] A: 0.33507 | B: 0.44387 | C: 0.41628 ** [JOINT LOSS] ** : 1.078735 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005736 | Grad Max: 0.144012 -> Layer: shared_layers.0.bias | Grad Mean: 0.246662 | Grad Max: 1.330100 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001902 | Grad Max: 0.009013 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009992 | Grad Max: 0.009992 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.117875 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037865 | Grad Max: 0.654216 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.010034 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017295 | Grad Max: 0.056839 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001152 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004034 | Grad Max: 0.009263 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000587 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001472 | Grad Max: 0.003844 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003611 | Grad Max: 0.006741 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050324 | Grad Max: 0.050324 [GRADIENT NORM TOTAL] 4.9377 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.161 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5238829 0.47611713] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.013 [MASKS] A(Pass/Fail): 6/1610 | B: 185/1863 | C: 174/1202 [LOSS Ex1] A: 0.67987 | B: 0.68219 | C: 0.67796 [LOGITS Ex2 A] Mean Abs: 1.153 | Max: 4.635 [LOSS Ex2] A: 0.31776 | B: 0.43434 | C: 0.38981 ** [JOINT LOSS] ** : 1.060645 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002971 | Grad Max: 0.079283 -> Layer: shared_layers.0.bias | Grad Mean: 0.106956 | Grad Max: 0.581226 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001949 | Grad Max: 0.009745 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.011892 | Grad Max: 0.011892 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000930 | Grad Max: 0.059576 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016884 | Grad Max: 0.327373 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.005668 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007647 | Grad Max: 0.026544 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000551 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001812 | Grad Max: 0.004557 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000317 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000665 | Grad Max: 0.001948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001780 | Grad Max: 0.003952 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024057 | Grad Max: 0.024057 [GRADIENT NORM TOTAL] 2.2030 [EPOCH SUMMARY] Train Loss: 1.0696 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0436 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0466 -> New: 1.0436) ############################## EPOCH 28/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.287 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087953 0.49120468] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 3/2045 | B: 177/1871 | C: 237/1811 [LOSS Ex1] A: 0.68022 | B: 0.68014 | C: 0.67862 [LOGITS Ex2 A] Mean Abs: 1.189 | Max: 5.127 [LOSS Ex2] A: 0.32237 | B: 0.41906 | C: 0.40774 ** [JOINT LOSS] ** : 1.062714 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.096682 -> Layer: shared_layers.0.bias | Grad Mean: 0.213532 | Grad Max: 1.253441 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.009294 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010937 | Grad Max: 0.010937 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001708 | Grad Max: 
0.099755 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032354 | Grad Max: 0.567617 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.010112 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014647 | Grad Max: 0.052557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.001022 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003394 | Grad Max: 0.008775 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000443 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001234 | Grad Max: 0.003349 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002893 | Grad Max: 0.005803 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042389 | Grad Max: 0.042389 [GRADIENT NORM TOTAL] 4.3385 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.273 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50046057 0.4995394 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 6/2042 | B: 165/1691 | C: 236/1812 [LOSS Ex1] A: 0.67979 | B: 0.68215 | C: 0.67871 [LOGITS Ex2 A] Mean Abs: 1.188 | Max: 4.765 [LOSS Ex2] A: 0.33251 | B: 0.42316 | C: 0.40627 ** [JOINT LOSS] ** : 1.067534 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006666 | Grad Max: 0.149986 -> Layer: shared_layers.0.bias | Grad Mean: 0.292075 | Grad Max: 1.710276 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.009873 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014084 | Grad Max: 0.014084 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002427 | Grad Max: 0.132277 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045689 | Grad Max: 0.720080 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.012746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020476 | Grad Max: 0.068145 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001371 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004747 | Grad Max: 0.011397 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000610 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001730 | Grad Max: 0.004600 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004160 | Grad Max: 0.008174 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059744 | Grad Max: 0.059744 [GRADIENT NORM TOTAL] 5.8828 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.279 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063853 0.4936147] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 11/2037 | B: 173/1875 | C: 234/1814 [LOSS Ex1] A: 0.67890 | B: 0.68177 | C: 0.67825 [LOGITS Ex2 A] Mean Abs: 1.170 | Max: 5.309 [LOSS Ex2] A: 0.34393 | B: 0.44673 | C: 0.37232 ** [JOINT LOSS] ** : 1.067298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005107 | Grad Max: 0.151228 -> Layer: shared_layers.0.bias | Grad Mean: 0.163154 | Grad Max: 0.863652 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.009780 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009629 | Grad Max: 0.009629 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.082603 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025477 | Grad Max: 0.414629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.006467 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011029 | Grad Max: 0.033880 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000777 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002597 | Grad Max: 0.006553 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000389 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000953 | Grad Max: 0.002707 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002232 | Grad Max: 0.004785 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032653 | Grad Max: 0.032653 [GRADIENT NORM TOTAL] 3.2410 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.242 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085627 0.49143732] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 5/2043 | B: 188/1860 | C: 240/1808 [LOSS Ex1] A: 0.68120 | B: 0.68211 | C: 0.67800 [LOGITS Ex2 A] Mean Abs: 1.113 | Max: 4.852 [LOSS Ex2] A: 0.31846 | B: 0.43517 | C: 0.41645 ** [JOINT LOSS] ** : 1.070464 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.061402 -> Layer: shared_layers.0.bias | Grad Mean: 0.114148 | Grad Max: 0.637683 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.008439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007440 | Grad Max: 0.007440 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000908 | Grad Max: 0.052279 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016940 | Grad Max: 0.289205 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.006373 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007931 | Grad Max: 0.032347 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000546 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001835 | Grad Max: 0.004332 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000278 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000668 | Grad Max: 0.001908 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001779 | Grad Max: 0.004188 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024083 | Grad Max: 0.024083 [GRADIENT NORM TOTAL] 2.3024 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.037 | Max: 0.170 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51047885 0.48952112] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 12/2036 | B: 177/1871 | C: 229/1819 [LOSS Ex1] A: 0.68077 | B: 0.68004 | C: 0.67853 [LOGITS Ex2 A] Mean Abs: 1.077 | Max: 5.308 
[LOSS Ex2] A: 0.33117 | B: 0.41601 | C: 0.40555 ** [JOINT LOSS] ** : 1.064028 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003385 | Grad Max: 0.087062 -> Layer: shared_layers.0.bias | Grad Mean: 0.190775 | Grad Max: 1.102121 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001866 | Grad Max: 0.008665 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004924 | Grad Max: 0.004924 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001492 | Grad Max: 0.088266 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028480 | Grad Max: 0.492203 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.009287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013540 | Grad Max: 0.050835 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000803 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003132 | Grad Max: 0.007637 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000409 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001132 | Grad Max: 0.002948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002749 | Grad Max: 0.006311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038228 | Grad Max: 0.038228 [GRADIENT NORM TOTAL] 3.8431 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.256 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54343307 0.45656696] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.013 [MASKS] A(Pass/Fail): 8/2040 | B: 166/1690 | C: 254/1794 [LOSS Ex1] A: 0.67988 | B: 0.68207 | C: 0.67722 [LOGITS Ex2 A] Mean Abs: 1.143 | Max: 5.125 [LOSS Ex2] A: 0.32601 | B: 0.41456 | C: 0.38377 ** [JOINT LOSS] ** : 1.054505 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.040447 -> Layer: shared_layers.0.bias | Grad Mean: 0.084002 | Grad Max: 0.430196 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.010317 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015449 | Grad Max: 
0.015449 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000680 | Grad Max: 0.053514 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011911 | Grad Max: 0.299942 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.004800 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004737 | Grad Max: 0.021567 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000415 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001077 | Grad Max: 0.003165 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000381 | Grad Max: 0.001225 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000861 | Grad Max: 0.002645 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012092 | Grad Max: 0.012092 [GRADIENT NORM TOTAL] 1.7562 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.288 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50460064 0.49539936] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 8/2040 | B: 173/1875 | C: 237/1811 [LOSS Ex1] A: 0.68169 | B: 0.68168 | C: 0.67814 [LOGITS Ex2 A] Mean Abs: 1.177 | Max: 4.518 [LOSS Ex2] A: 0.33050 | B: 0.45056 | C: 0.39552 ** [JOINT LOSS] ** : 1.072697 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003829 | Grad Max: 0.090729 -> Layer: shared_layers.0.bias | Grad Mean: 0.203930 | Grad Max: 1.134232 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009250 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016457 | Grad Max: 0.016457 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.098199 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030599 | Grad Max: 0.552833 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000405 | Grad Max: 0.009619 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014633 | Grad Max: 0.052773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000974 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.003367 | Grad Max: 0.008326 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001216 | Grad Max: 0.003452 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002816 | Grad Max: 0.005718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041341 | Grad Max: 0.041341 [GRADIENT NORM TOTAL] 4.0880 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.118 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54250455 0.45749545] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 11/2037 | B: 189/1859 | C: 236/1812 [LOSS Ex1] A: 0.68051 | B: 0.68202 | C: 0.67838 [LOGITS Ex2 A] Mean Abs: 1.179 | Max: 5.008 [LOSS Ex2] A: 0.32669 | B: 0.43986 | C: 0.37894 ** [JOINT LOSS] ** : 1.062133 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003490 | Grad Max: 0.087588 -> Layer: shared_layers.0.bias | Grad Mean: 0.223068 | Grad Max: 1.221784 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001792 | Grad Max: 0.008338 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004214 | Grad Max: 0.004214 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.110276 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032296 | Grad Max: 0.624411 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.010485 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015137 | Grad Max: 0.054231 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000989 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003502 | Grad Max: 0.008401 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000442 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001271 | Grad Max: 0.003243 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002871 | Grad Max: 0.006430 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043130 | Grad Max: 0.043130 [GRADIENT NORM 
TOTAL] 4.5648 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.163 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.524433 0.47556695] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.013 [MASKS] A(Pass/Fail): 7/1609 | B: 178/1870 | C: 244/1804 [LOSS Ex1] A: 0.67970 | B: 0.67995 | C: 0.67810 [LOGITS Ex2 A] Mean Abs: 1.224 | Max: 4.736 [LOSS Ex2] A: 0.30810 | B: 0.41725 | C: 0.36704 ** [JOINT LOSS] ** : 1.043379 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.051746 -> Layer: shared_layers.0.bias | Grad Mean: 0.105315 | Grad Max: 0.583130 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.009723 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009147 | Grad Max: 0.009147 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000827 | Grad Max: 0.048930 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015457 | Grad Max: 0.280295 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.006248 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007196 | Grad Max: 0.031528 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000554 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001656 | Grad Max: 0.004205 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000306 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001931 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001154 | Grad Max: 0.003786 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019928 | Grad Max: 0.019928 [GRADIENT NORM TOTAL] 2.1507 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.289 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50870925 0.4912908 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 5/2043 | B: 169/1687 | C: 263/1785 [LOSS Ex1] A: 
0.68003 | B: 0.68197 | C: 0.67571 [LOGITS Ex2 A] Mean Abs: 1.173 | Max: 5.070 [LOSS Ex2] A: 0.32614 | B: 0.41970 | C: 0.41432 ** [JOINT LOSS] ** : 1.065961 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006416 | Grad Max: 0.166731 -> Layer: shared_layers.0.bias | Grad Mean: 0.230997 | Grad Max: 1.263296 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.009452 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010968 | Grad Max: 0.010968 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.129903 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036552 | Grad Max: 0.680241 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000452 | Grad Max: 0.009645 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016107 | Grad Max: 0.053482 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001022 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003757 | Grad Max: 0.009232 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000510 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001359 | Grad Max: 0.003636 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003324 | Grad Max: 0.006321 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046390 | Grad Max: 0.046390 [GRADIENT NORM TOTAL] 4.6730 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.275 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002699 0.49973008] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 12/2036 | B: 179/1869 | C: 250/1798 [LOSS Ex1] A: 0.67958 | B: 0.68159 | C: 0.67695 [LOGITS Ex2 A] Mean Abs: 1.159 | Max: 4.913 [LOSS Ex2] A: 0.33371 | B: 0.45441 | C: 0.43490 ** [JOINT LOSS] ** : 1.087051 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008072 | Grad Max: 0.207689 -> Layer: shared_layers.0.bias | Grad Mean: 0.337294 | Grad Max: 1.897505 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad 
Max: 0.009729 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011248 | Grad Max: 0.011248 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002810 | Grad Max: 0.164753 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053165 | Grad Max: 0.918640 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000666 | Grad Max: 0.015827 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023907 | Grad Max: 0.078456 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001471 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005542 | Grad Max: 0.012873 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.000693 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002012 | Grad Max: 0.005078 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004925 | Grad Max: 0.010156 -> Layer: exit2_layers.12.bias | Grad Mean: 0.069178 | Grad Max: 0.069178 [GRADIENT NORM TOTAL] 6.8210 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.282 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062246 0.4937754] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 13/2035 | B: 194/1854 | C: 245/1803 [LOSS Ex1] A: 0.67870 | B: 0.68193 | C: 0.67587 [LOGITS Ex2 A] Mean Abs: 1.145 | Max: 5.193 [LOSS Ex2] A: 0.33039 | B: 0.43706 | C: 0.40186 ** [JOINT LOSS] ** : 1.068600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005027 | Grad Max: 0.121496 -> Layer: shared_layers.0.bias | Grad Mean: 0.227211 | Grad Max: 1.301727 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.009497 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005922 | Grad Max: 0.005922 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001838 | Grad Max: 0.136711 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035021 | Grad Max: 0.755607 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.011709 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016200 | Grad Max: 0.058555 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001041 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003742 | Grad Max: 0.008594 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000453 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001351 | Grad Max: 0.003447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003309 | Grad Max: 0.006329 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046854 | Grad Max: 0.046854 [GRADIENT NORM TOTAL] 4.5871 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.244 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084439 0.4915561] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 12/2036 | B: 186/1862 | C: 261/1787 [LOSS Ex1] A: 0.68104 | B: 0.67985 | C: 0.67602 [LOGITS Ex2 A] Mean Abs: 1.157 | Max: 4.583 [LOSS Ex2] A: 0.33219 | B: 0.42261 | C: 0.39711 ** [JOINT LOSS] ** : 1.062940 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.050939 -> Layer: shared_layers.0.bias | Grad Mean: 0.100730 | Grad Max: 0.577424 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.007667 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001884 | Grad Max: 0.001884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000807 | Grad Max: 0.065569 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014680 | Grad Max: 0.378716 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.005186 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005983 | Grad Max: 0.025144 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000449 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001356 | Grad Max: 0.003604 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000486 | Grad Max: 0.001546 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000879 | Grad Max: 0.002923 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.015109 | Grad Max: 0.015109 [GRADIENT NORM TOTAL] 2.1254 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.171 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108137 0.4891863] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.013 [MASKS] A(Pass/Fail): 15/2033 | B: 171/1685 | C: 149/1227 [LOSS Ex1] A: 0.68062 | B: 0.68189 | C: 0.67931 [LOGITS Ex2 A] Mean Abs: 1.147 | Max: 4.879 [LOSS Ex2] A: 0.32998 | B: 0.42412 | C: 0.39923 ** [JOINT LOSS] ** : 1.065050 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004701 | Grad Max: 0.131176 -> Layer: shared_layers.0.bias | Grad Mean: 0.219515 | Grad Max: 1.241413 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001807 | Grad Max: 0.008756 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009163 | Grad Max: 0.009163 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001683 | Grad Max: 0.093318 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032210 | Grad Max: 0.523780 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.010138 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014987 | Grad Max: 0.054274 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000858 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003452 | Grad Max: 0.007639 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000478 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001251 | Grad Max: 0.003263 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002876 | Grad Max: 0.006232 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042213 | Grad Max: 0.042213 [GRADIENT NORM TOTAL] 4.3236 [EPOCH SUMMARY] Train Loss: 1.0653 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0359 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0436 -> New: 1.0359) ############################## EPOCH 29/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54436916 0.45563087] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 15/2033 | B: 179/1869 | C: 228/1820 [LOSS Ex1] A: 0.67969 | B: 0.68150 | C: 0.67715 [LOGITS Ex2 A] Mean Abs: 1.177 | Max: 5.375 [LOSS Ex2] A: 0.31808 | B: 0.44849 | C: 0.38476 ** [JOINT LOSS] ** : 1.063223 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003643 | Grad Max: 0.108500 -> Layer: shared_layers.0.bias | Grad Mean: 0.125937 | Grad Max: 0.672440 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.009611 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012840 | Grad Max: 0.012840 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000991 | Grad Max: 0.054160 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018619 | Grad Max: 0.263913 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.006974 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008530 | Grad Max: 0.031279 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000533 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001971 | Grad Max: 0.004733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000285 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001528 | Grad Max: 0.003839 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023120 | Grad Max: 0.023120 [GRADIENT NORM TOTAL] 2.4632 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50448734 0.49551272] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 13/2035 | B: 194/1854 | C: 241/1807 [LOSS Ex1] A: 0.68152 | B: 0.68184 | C: 0.67793 [LOGITS Ex2 A] Mean Abs: 1.158 | Max: 4.655 [LOSS Ex2] A: 0.31664 | B: 0.43495 | C: 0.40025 ** [JOINT LOSS] ** : 1.064380 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.111391 -> Layer: shared_layers.0.bias | Grad Mean: 0.161005 | Grad Max: 0.884052 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.008737 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012415 | Grad Max: 0.012415 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001325 | Grad Max: 0.078351 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024305 | Grad Max: 0.439279 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.007152 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010994 | Grad Max: 0.036946 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000729 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002577 | Grad Max: 0.006172 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000372 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000932 | Grad Max: 0.002369 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002329 | Grad Max: 0.004955 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032440 | Grad Max: 0.032440 [GRADIENT NORM TOTAL] 3.2416 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.120 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54328895 0.45671108] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 19/2029 | B: 188/1860 | C: 236/1812 [LOSS Ex1] A: 0.68033 | B: 0.67976 | C: 0.67791 [LOGITS Ex2 A] Mean Abs: 1.138 | Max: 5.041 [LOSS Ex2] A: 0.32158 | B: 0.42311 | C: 0.42454 ** [JOINT LOSS] ** : 1.069078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.119901 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.256747 | Grad Max: 1.433094 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.008727 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002972 | Grad Max: 0.002972 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.198749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037765 | Grad Max: 1.127033 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000487 | Grad Max: 0.012704 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017636 | Grad Max: 0.063235 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.001191 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004071 | Grad Max: 0.010471 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000492 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.003808 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003641 | Grad Max: 0.006708 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051086 | Grad Max: 0.051086 [GRADIENT NORM TOTAL] 5.2246 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5249537 0.47504625] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 13/1603 | B: 172/1684 | C: 240/1808 [LOSS Ex1] A: 0.67952 | B: 0.68180 | C: 0.67788 [LOGITS Ex2 A] Mean Abs: 1.193 | Max: 4.725 [LOSS Ex2] A: 0.29876 | B: 0.40459 | C: 0.40059 ** [JOINT LOSS] ** : 1.047715 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001480 | Grad Max: 0.029687 -> Layer: shared_layers.0.bias | Grad Mean: 0.083135 | Grad Max: 0.425735 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001886 | Grad Max: 0.009242 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009236 | Grad Max: 0.009236 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000670 | Grad Max: 0.058582 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012207 | Grad Max: 0.327037 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 
| Grad Max: 0.004704 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004866 | Grad Max: 0.023616 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000427 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001115 | Grad Max: 0.003413 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000182 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000398 | Grad Max: 0.001282 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000997 | Grad Max: 0.003267 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013401 | Grad Max: 0.013401 [GRADIENT NORM TOTAL] 1.7874 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.292 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086523 0.49134764] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 12/2036 | B: 179/1869 | C: 211/1837 [LOSS Ex1] A: 0.67985 | B: 0.68141 | C: 0.67875 [LOGITS Ex2 A] Mean Abs: 1.233 | Max: 5.114 [LOSS Ex2] A: 0.32523 | B: 0.43765 | C: 0.38441 ** [JOINT LOSS] ** : 1.062436 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005789 | Grad Max: 0.160915 -> Layer: shared_layers.0.bias | Grad Mean: 0.216411 | Grad Max: 1.109783 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001911 | Grad Max: 0.009284 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010952 | Grad Max: 0.010952 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001802 | Grad Max: 0.129340 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033611 | Grad Max: 0.693514 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.010283 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014529 | Grad Max: 0.054729 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000910 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003369 | Grad Max: 0.008053 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000456 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001215 | Grad Max: 
0.003262 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002826 | Grad Max: 0.005963 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040375 | Grad Max: 0.040375 [GRADIENT NORM TOTAL] 4.2712 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.277 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50009024 0.49990976] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 15/2033 | B: 194/1854 | C: 250/1798 [LOSS Ex1] A: 0.67939 | B: 0.68175 | C: 0.67674 [LOGITS Ex2 A] Mean Abs: 1.218 | Max: 4.630 [LOSS Ex2] A: 0.32722 | B: 0.44416 | C: 0.40829 ** [JOINT LOSS] ** : 1.072521 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006542 | Grad Max: 0.162992 -> Layer: shared_layers.0.bias | Grad Mean: 0.295633 | Grad Max: 1.644493 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.010154 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017846 | Grad Max: 0.017846 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.150165 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045389 | Grad Max: 0.854643 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000564 | Grad Max: 0.012616 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020397 | Grad Max: 0.067647 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001258 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004698 | Grad Max: 0.011533 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000599 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001677 | Grad Max: 0.004309 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003835 | Grad Max: 0.007094 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055248 | Grad Max: 0.055248 [GRADIENT NORM TOTAL] 5.9344 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.284 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.50613165 0.49386832] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 16/2032 | B: 188/1860 | C: 233/1815 [LOSS Ex1] A: 0.67850 | B: 0.67966 | C: 0.67756 [LOGITS Ex2 A] Mean Abs: 1.200 | Max: 4.624 [LOSS Ex2] A: 0.33283 | B: 0.41465 | C: 0.39225 ** [JOINT LOSS] ** : 1.058483 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004573 | Grad Max: 0.134973 -> Layer: shared_layers.0.bias | Grad Mean: 0.152470 | Grad Max: 0.750317 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.009741 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010813 | Grad Max: 0.010813 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001277 | Grad Max: 0.104645 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023316 | Grad Max: 0.568391 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.006152 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010016 | Grad Max: 0.032272 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000634 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002341 | Grad Max: 0.005616 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000351 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000848 | Grad Max: 0.002383 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002114 | Grad Max: 0.004948 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029892 | Grad Max: 0.029892 [GRADIENT NORM TOTAL] 3.0159 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.246 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083878 0.4916122] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.514 | Std: 0.014 [MASKS] A(Pass/Fail): 15/2033 | B: 176/1680 | C: 258/1790 [LOSS Ex1] A: 0.68088 | B: 0.68171 | C: 0.67580 [LOGITS Ex2 A] Mean Abs: 1.139 | Max: 5.422 [LOSS Ex2] A: 0.32046 | B: 0.41752 | C: 0.39632 ** [JOINT LOSS] ** : 1.057559 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.005147 | Grad Max: 0.125206 -> Layer: shared_layers.0.bias | Grad Mean: 0.246457 | Grad Max: 1.428818 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001819 | Grad Max: 0.007853 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001720 | Grad Max: 0.001720 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.125184 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037441 | Grad Max: 0.670159 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000475 | Grad Max: 0.011247 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017113 | Grad Max: 0.058921 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001001 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003952 | Grad Max: 0.009507 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000478 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001410 | Grad Max: 0.003474 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003376 | Grad Max: 0.006877 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047413 | Grad Max: 0.047413 [GRADIENT NORM TOTAL] 4.9999 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.038 | Max: 0.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51112145 0.48887855] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.014 [MASKS] A(Pass/Fail): 15/2033 | B: 180/1868 | C: 253/1795 [LOSS Ex1] A: 0.68045 | B: 0.68132 | C: 0.67640 [LOGITS Ex2 A] Mean Abs: 1.105 | Max: 4.765 [LOSS Ex2] A: 0.33096 | B: 0.46255 | C: 0.40630 ** [JOINT LOSS] ** : 1.079325 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006605 | Grad Max: 0.161526 -> Layer: shared_layers.0.bias | Grad Mean: 0.319038 | Grad Max: 1.826216 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.008384 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001936 | Grad Max: 0.001936 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002555 | Grad Max: 0.155922 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.048268 | Grad Max: 0.843119 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000613 | Grad Max: 0.014102 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022163 | Grad Max: 0.073370 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001338 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005127 | Grad Max: 0.012222 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000626 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001839 | Grad Max: 0.004639 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004281 | Grad Max: 0.008147 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061593 | Grad Max: 0.061593 [GRADIENT NORM TOTAL] 6.4003 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.261 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54525185 0.45474818] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 17/2031 | B: 196/1852 | C: 271/1777 [LOSS Ex1] A: 0.67948 | B: 0.68167 | C: 0.67614 [LOGITS Ex2 A] Mean Abs: 1.187 | Max: 5.120 [LOSS Ex2] A: 0.30519 | B: 0.44000 | C: 0.38677 ** [JOINT LOSS] ** : 1.056419 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004228 | Grad Max: 0.102107 -> Layer: shared_layers.0.bias | Grad Mean: 0.191104 | Grad Max: 1.023562 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.010523 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019189 | Grad Max: 0.019189 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001526 | Grad Max: 0.079182 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028324 | Grad Max: 0.431318 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.007777 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012935 | Grad Max: 0.042606 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000883 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002995 | Grad Max: 0.007403 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | 
Grad Max: 0.000381 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001069 | Grad Max: 0.002826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002604 | Grad Max: 0.005221 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035890 | Grad Max: 0.035890 [GRADIENT NORM TOTAL] 3.7315 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.294 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50439596 0.495604 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 13/2035 | B: 189/1859 | C: 253/1795 [LOSS Ex1] A: 0.68135 | B: 0.67957 | C: 0.67710 [LOGITS Ex2 A] Mean Abs: 1.185 | Max: 4.829 [LOSS Ex2] A: 0.32056 | B: 0.42588 | C: 0.39513 ** [JOINT LOSS] ** : 1.059864 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.056716 -> Layer: shared_layers.0.bias | Grad Mean: 0.130522 | Grad Max: 0.717344 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.008863 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009988 | Grad Max: 0.009988 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001017 | Grad Max: 0.066176 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018727 | Grad Max: 0.378650 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.005976 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008556 | Grad Max: 0.030969 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000567 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001921 | Grad Max: 0.004624 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.001870 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001429 | Grad Max: 0.003788 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022373 | Grad Max: 0.022373 [GRADIENT NORM TOTAL] 2.6753 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.122 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54406834 0.45593163] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 21/2027 | B: 180/1676 | C: 262/1786 [LOSS Ex1] A: 0.68015 | B: 0.68162 | C: 0.67599 [LOGITS Ex2 A] Mean Abs: 1.213 | Max: 4.855 [LOSS Ex2] A: 0.33022 | B: 0.42333 | C: 0.38936 ** [JOINT LOSS] ** : 1.060225 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003793 | Grad Max: 0.096862 -> Layer: shared_layers.0.bias | Grad Mean: 0.221442 | Grad Max: 1.237035 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001907 | Grad Max: 0.009234 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006673 | Grad Max: 0.006673 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001713 | Grad Max: 0.109065 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032465 | Grad Max: 0.606849 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000413 | Grad Max: 0.010739 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015048 | Grad Max: 0.057929 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000991 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003447 | Grad Max: 0.008527 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000437 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001231 | Grad Max: 0.003150 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002825 | Grad Max: 0.005428 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041424 | Grad Max: 0.041424 [GRADIENT NORM TOTAL] 4.4420 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.168 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52552354 0.4744764 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 14/1602 | B: 182/1866 | C: 243/1805 [LOSS Ex1] A: 0.67933 | B: 0.68124 | C: 0.67842 [LOGITS Ex2 A] Mean Abs: 1.227 | Max: 5.160 [LOSS Ex2] A: 0.29726 | B: 0.44495 | C: 
0.39013 ** [JOINT LOSS] ** : 1.057107 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.041090 -> Layer: shared_layers.0.bias | Grad Mean: 0.084524 | Grad Max: 0.429748 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.008803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003489 | Grad Max: 0.003489 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000645 | Grad Max: 0.053934 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011803 | Grad Max: 0.306701 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.004186 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005176 | Grad Max: 0.020619 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001166 | Grad Max: 0.003243 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000198 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000417 | Grad Max: 0.001246 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000760 | Grad Max: 0.002623 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013329 | Grad Max: 0.013329 [GRADIENT NORM TOTAL] 1.7064 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.294 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086065 0.4913935] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 13/2035 | B: 199/1849 | C: 155/1221 [LOSS Ex1] A: 0.67965 | B: 0.68158 | C: 0.68001 [LOGITS Ex2 A] Mean Abs: 1.193 | Max: 6.454 [LOSS Ex2] A: 0.32004 | B: 0.42313 | C: 0.40047 ** [JOINT LOSS] ** : 1.061627 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003969 | Grad Max: 0.090749 -> Layer: shared_layers.0.bias | Grad Mean: 0.193055 | Grad Max: 1.015774 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.009151 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013301 | Grad Max: 0.013301 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.001461 | Grad Max: 0.094780 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027796 | Grad Max: 0.521335 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.008721 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012750 | Grad Max: 0.048075 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000781 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002932 | Grad Max: 0.007177 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000427 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001036 | Grad Max: 0.002899 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002314 | Grad Max: 0.004892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033960 | Grad Max: 0.033960 [GRADIENT NORM TOTAL] 3.7766 [EPOCH SUMMARY] Train Loss: 1.0621 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0382 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 30/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.280 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50008655 0.49991345] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 18/2030 | B: 192/1856 | C: 255/1793 [LOSS Ex1] A: 0.67917 | B: 0.67947 | C: 0.67691 [LOGITS Ex2 A] Mean Abs: 1.177 | Max: 5.122 [LOSS Ex2] A: 0.30692 | B: 0.41917 | C: 0.39634 ** [JOINT LOSS] ** : 1.052663 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004070 | Grad Max: 0.109714 -> Layer: shared_layers.0.bias | Grad Mean: 0.237020 | Grad Max: 1.288446 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.010258 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016537 | Grad Max: 0.016537 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001814 | Grad Max: 0.112255 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034404 | Grad Max: 0.646463 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.011124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015730 | Grad Max: 0.060841 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000957 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003628 | Grad Max: 0.008484 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000464 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001293 | Grad Max: 0.003371 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003083 | Grad Max: 0.005950 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043872 | Grad Max: 0.043872 [GRADIENT NORM TOTAL] 4.6645 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.287 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060296 0.4939704] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 18/2030 | B: 180/1676 | C: 225/1823 [LOSS Ex1] A: 0.67829 | B: 0.68153 | C: 0.67916 [LOGITS Ex2 A] Mean Abs: 1.171 | Max: 5.070 [LOSS Ex2] A: 0.33116 | B: 0.40426 | C: 0.38952 ** [JOINT LOSS] ** : 1.054638 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002809 | Grad Max: 0.071894 -> Layer: shared_layers.0.bias | Grad Mean: 0.090289 | Grad Max: 0.421175 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.010030 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014735 | Grad Max: 0.014735 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000748 | Grad Max: 0.057155 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012488 | Grad Max: 0.293412 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.005409 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004497 | Grad Max: 0.026935 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000359 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000978 | Grad Max: 0.003022 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000163 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000331 | Grad Max: 0.001095 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000717 | Grad Max: 0.002535 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010326 | Grad Max: 0.010326 [GRADIENT NORM TOTAL] 1.7572 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.247 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083104 0.49168956] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 15/2033 | B: 182/1866 | C: 264/1784 [LOSS Ex1] A: 0.68071 | B: 0.68114 | C: 0.67543 [LOGITS Ex2 A] Mean Abs: 1.198 | Max: 4.854 [LOSS Ex2] A: 0.32650 | B: 0.44545 | C: 0.38769 ** [JOINT LOSS] ** : 1.065639 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004567 | Grad Max: 0.105548 -> Layer: shared_layers.0.bias | Grad Mean: 0.208715 | Grad Max: 1.154316 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001843 | Grad Max: 0.007668 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000569 | Grad Max: 0.000569 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001642 | Grad Max: 0.099167 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030898 | Grad Max: 0.536921 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.009177 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013950 | Grad Max: 0.045666 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000795 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003214 | Grad Max: 0.007386 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000393 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001141 | Grad Max: 0.003024 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002455 | Grad Max: 0.005099 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037342 | Grad Max: 0.037342 [GRADIENT NORM TOTAL] 4.1197 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.039 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5114407 0.4885593] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.514 | Std: 0.014 [MASKS] A(Pass/Fail): 15/2033 | B: 202/1846 | C: 231/1817 [LOSS Ex1] A: 0.68028 | B: 0.68149 | C: 0.67720 [LOGITS Ex2 A] Mean Abs: 1.190 | Max: 4.756 [LOSS Ex2] A: 0.33469 | B: 0.44359 | C: 0.39194 ** [JOINT LOSS] ** : 1.069730 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005903 | Grad Max: 0.160076 -> Layer: shared_layers.0.bias | Grad Mean: 0.273634 | Grad Max: 1.499787 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.008386 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001382 | Grad Max: 0.001382 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.162097 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040228 | Grad Max: 0.893487 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.011689 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018070 | Grad Max: 0.063294 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001160 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004152 | Grad Max: 0.010161 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000505 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001480 | Grad Max: 0.003780 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003399 | Grad Max: 0.006581 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048774 | Grad Max: 0.048774 [GRADIENT NORM TOTAL] 5.4433 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.263 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5462029 0.4537971] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.014 [MASKS] A(Pass/Fail): 18/2030 | B: 192/1856 | C: 263/1785 [LOSS Ex1] A: 0.67927 | B: 0.67937 | C: 0.67729 [LOGITS Ex2 A] Mean Abs: 1.210 | Max: 4.907 [LOSS Ex2] A: 0.30830 | B: 0.42318 | C: 0.39039 ** [JOINT LOSS] ** : 1.052601 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002902 | Grad Max: 0.071344 -> Layer: shared_layers.0.bias | Grad Mean: 0.107529 | Grad Max: 0.565970 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.009441 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011218 | Grad Max: 0.011218 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000851 | Grad Max: 0.088288 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015579 | Grad Max: 0.490292 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.005337 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006855 | Grad Max: 0.026738 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000500 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001580 | Grad Max: 0.004059 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000569 | Grad Max: 0.001572 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001314 | Grad Max: 0.003367 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019569 | Grad Max: 0.019569 [GRADIENT NORM TOTAL] 2.1627 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.296 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50429904 0.49570093] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 16/2032 | B: 182/1674 | C: 224/1824 [LOSS Ex1] A: 0.68117 | B: 0.68144 | C: 0.67896 [LOGITS Ex2 A] Mean Abs: 1.172 | Max: 4.721 [LOSS Ex2] A: 0.31243 | B: 0.41676 | C: 0.40118 ** [JOINT LOSS] ** : 1.057312 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005827 | Grad Max: 0.180607 -> Layer: shared_layers.0.bias | Grad Mean: 0.203659 | Grad Max: 1.107930 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001865 | Grad Max: 0.008779 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013936 | Grad Max: 0.013936 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001748 | Grad Max: 
0.125365 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032161 | Grad Max: 0.669681 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000402 | Grad Max: 0.008864 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014432 | Grad Max: 0.048244 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000936 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003346 | Grad Max: 0.008266 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000437 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001182 | Grad Max: 0.003015 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002830 | Grad Max: 0.006201 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039159 | Grad Max: 0.039159 [GRADIENT NORM TOTAL] 4.1647 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.125 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5449559 0.4550441] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 24/2024 | B: 184/1864 | C: 243/1805 [LOSS Ex1] A: 0.67995 | B: 0.68105 | C: 0.67656 [LOGITS Ex2 A] Mean Abs: 1.168 | Max: 5.406 [LOSS Ex2] A: 0.33762 | B: 0.46314 | C: 0.41371 ** [JOINT LOSS] ** : 1.084008 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006979 | Grad Max: 0.172315 -> Layer: shared_layers.0.bias | Grad Mean: 0.306028 | Grad Max: 1.722298 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.008755 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004277 | Grad Max: 0.004277 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002522 | Grad Max: 0.143203 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047561 | Grad Max: 0.782215 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000603 | Grad Max: 0.014887 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021838 | Grad Max: 0.076066 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001163 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005042 | Grad Max: 0.010885 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000604 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001809 | Grad Max: 0.004571 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004366 | Grad Max: 0.008540 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061752 | Grad Max: 0.061752 [GRADIENT NORM TOTAL] 6.1594 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.170 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5261418 0.47385818] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 18/1598 | B: 205/1843 | C: 251/1797 [LOSS Ex1] A: 0.67913 | B: 0.68140 | C: 0.67621 [LOGITS Ex2 A] Mean Abs: 1.212 | Max: 4.751 [LOSS Ex2] A: 0.30536 | B: 0.44588 | C: 0.39153 ** [JOINT LOSS] ** : 1.059838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005667 | Grad Max: 0.140175 -> Layer: shared_layers.0.bias | Grad Mean: 0.198704 | Grad Max: 1.125647 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.008797 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000096 | Grad Max: 0.000096 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001658 | Grad Max: 0.136065 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030616 | Grad Max: 0.726014 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000380 | Grad Max: 0.008897 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013579 | Grad Max: 0.044573 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000850 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003149 | Grad Max: 0.007327 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000429 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001128 | Grad Max: 0.002914 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002879 | Grad Max: 0.005669 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039068 | Grad Max: 0.039068 [GRADIENT NORM TOTAL] 4.0025 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085377 0.49146232] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 17/2031 | B: 194/1854 | C: 265/1783 [LOSS Ex1] A: 0.67945 | B: 0.67927 | C: 0.67580 [LOGITS Ex2 A] Mean Abs: 1.241 | Max: 6.011 [LOSS Ex2] A: 0.32534 | B: 0.42848 | C: 0.38935 ** [JOINT LOSS] ** : 1.059228 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003000 | Grad Max: 0.077885 -> Layer: shared_layers.0.bias | Grad Mean: 0.192528 | Grad Max: 1.077483 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.009814 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013851 | Grad Max: 0.013851 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001459 | Grad Max: 0.105889 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027570 | Grad Max: 0.587453 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000354 | Grad Max: 0.010376 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012985 | Grad Max: 0.055157 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000842 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002955 | Grad Max: 0.007262 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000378 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001049 | Grad Max: 0.002829 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002373 | Grad Max: 0.004702 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034933 | Grad Max: 0.034933 [GRADIENT NORM TOTAL] 3.9181 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.282 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002875 0.49971256] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 21/2027 | B: 182/1674 | C: 233/1815 [LOSS Ex1] A: 0.67895 | B: 0.68134 | C: 0.67748 [LOGITS Ex2 A] Mean Abs: 1.236 | Max: 4.891 [LOSS 
Ex2] A: 0.33347 | B: 0.41211 | C: 0.39520 ** [JOINT LOSS] ** : 1.059519 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004121 | Grad Max: 0.098908 -> Layer: shared_layers.0.bias | Grad Mean: 0.215778 | Grad Max: 1.250906 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.009717 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014353 | Grad Max: 0.014353 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.098819 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032427 | Grad Max: 0.562722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.010094 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014885 | Grad Max: 0.055311 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000910 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003400 | Grad Max: 0.008017 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000423 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001205 | Grad Max: 0.003203 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002722 | Grad Max: 0.005244 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040452 | Grad Max: 0.040452 [GRADIENT NORM TOTAL] 4.3785 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.289 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50588006 0.49411994] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 22/2026 | B: 188/1860 | C: 261/1787 [LOSS Ex1] A: 0.67806 | B: 0.68096 | C: 0.67642 [LOGITS Ex2 A] Mean Abs: 1.226 | Max: 5.380 [LOSS Ex2] A: 0.32702 | B: 0.44294 | C: 0.39312 ** [JOINT LOSS] ** : 1.066174 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004024 | Grad Max: 0.127890 -> Layer: shared_layers.0.bias | Grad Mean: 0.144040 | Grad Max: 0.739506 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.009825 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011537 | Grad Max: 
0.011537 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001202 | Grad Max: 0.074862 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022227 | Grad Max: 0.421676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.006881 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010073 | Grad Max: 0.035614 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000773 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002325 | Grad Max: 0.005886 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000816 | Grad Max: 0.002152 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001697 | Grad Max: 0.003494 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026005 | Grad Max: 0.026005 [GRADIENT NORM TOTAL] 2.8615 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.249 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082236 0.49177638] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 17/2031 | B: 206/1842 | C: 228/1820 [LOSS Ex1] A: 0.68053 | B: 0.68131 | C: 0.67728 [LOGITS Ex2 A] Mean Abs: 1.172 | Max: 5.696 [LOSS Ex2] A: 0.30970 | B: 0.43841 | C: 0.39002 ** [JOINT LOSS] ** : 1.059088 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004266 | Grad Max: 0.097288 -> Layer: shared_layers.0.bias | Grad Mean: 0.208735 | Grad Max: 1.154611 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001773 | Grad Max: 0.007983 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003970 | Grad Max: 0.003970 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001637 | Grad Max: 0.097508 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030562 | Grad Max: 0.544967 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.009315 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014047 | Grad Max: 0.051589 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000807 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.003228 | Grad Max: 0.007313 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000408 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.002993 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002833 | Grad Max: 0.005663 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039171 | Grad Max: 0.039171 [GRADIENT NORM TOTAL] 4.1371 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.039 | Max: 0.174 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51174 0.48826003] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 20/2028 | B: 194/1854 | C: 231/1817 [LOSS Ex1] A: 0.68010 | B: 0.67918 | C: 0.67915 [LOGITS Ex2 A] Mean Abs: 1.116 | Max: 5.223 [LOSS Ex2] A: 0.32483 | B: 0.42547 | C: 0.40596 ** [JOINT LOSS] ** : 1.064898 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005992 | Grad Max: 0.147702 -> Layer: shared_layers.0.bias | Grad Mean: 0.289235 | Grad Max: 1.571650 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001825 | Grad Max: 0.008702 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006874 | Grad Max: 0.006874 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.130858 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042202 | Grad Max: 0.714703 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.012292 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019330 | Grad Max: 0.066977 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001279 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004454 | Grad Max: 0.010896 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000570 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001580 | Grad Max: 0.003940 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003696 | Grad Max: 0.007465 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053255 | Grad Max: 0.053255 [GRADIENT NORM 
TOTAL] 5.6507 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5471337 0.4528663] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 21/2027 | B: 184/1672 | C: 172/1204 [LOSS Ex1] A: 0.67905 | B: 0.68126 | C: 0.67328 [LOGITS Ex2 A] Mean Abs: 1.194 | Max: 5.212 [LOSS Ex2] A: 0.29849 | B: 0.40373 | C: 0.39020 ** [JOINT LOSS] ** : 1.042005 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001913 | Grad Max: 0.073212 -> Layer: shared_layers.0.bias | Grad Mean: 0.153810 | Grad Max: 0.834651 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.009910 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011594 | Grad Max: 0.011594 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.071909 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020812 | Grad Max: 0.393184 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.006526 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009589 | Grad Max: 0.035256 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000684 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002184 | Grad Max: 0.005696 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.002130 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001750 | Grad Max: 0.004343 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024590 | Grad Max: 0.024590 [GRADIENT NORM TOTAL] 3.0486 [EPOCH SUMMARY] Train Loss: 1.0605 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0329 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0359 -> New: 1.0329) ############################## EPOCH 31/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.298 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041964 0.4958036] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 18/2030 | B: 188/1860 | C: 222/1826 [LOSS Ex1] A: 0.68100 | B: 0.68087 | C: 0.67744 [LOGITS Ex2 A] Mean Abs: 1.225 | Max: 5.452 [LOSS Ex2] A: 0.29993 | B: 0.44767 | C: 0.38398 ** [JOINT LOSS] ** : 1.056964 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.085568 -> Layer: shared_layers.0.bias | Grad Mean: 0.195300 | Grad Max: 1.076839 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001932 | Grad Max: 0.009004 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013581 | Grad Max: 0.013581 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001494 | Grad Max: 0.093937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028809 | Grad Max: 0.509675 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000364 | Grad Max: 0.008784 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013335 | Grad Max: 0.047281 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000745 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003022 | Grad Max: 0.006919 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000406 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001070 | Grad Max: 0.002876 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002263 | Grad Max: 0.004933 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035318 | Grad Max: 0.035318 [GRADIENT NORM TOTAL] 3.9071 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.127 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54576504 0.454235 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 31/2017 | B: 208/1840 | C: 237/1811 [LOSS Ex1] A: 0.67977 | B: 0.68122 | C: 0.67744 [LOGITS Ex2 A] Mean Abs: 1.242 | Max: 4.904 [LOSS Ex2] A: 0.32321 | B: 0.43693 | C: 0.38471 ** [JOINT LOSS] ** : 1.061095 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004404 | Grad Max: 0.108093 -> Layer: shared_layers.0.bias | Grad Mean: 0.249441 | Grad Max: 1.371514 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001855 | Grad Max: 0.008909 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007726 | Grad Max: 0.007726 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001882 | Grad Max: 0.113334 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035967 | Grad Max: 0.636216 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.011239 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016512 | Grad Max: 0.063049 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001016 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003785 | Grad Max: 0.009226 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000465 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001342 | Grad Max: 0.003486 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002978 | Grad Max: 0.005968 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044427 | Grad Max: 0.044427 [GRADIENT NORM TOTAL] 4.9256 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5266981 0.47330186] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 26/1590 | B: 195/1853 | C: 252/1796 [LOSS Ex1] A: 0.67894 | B: 0.67908 | C: 0.67738 [LOGITS Ex2 A] Mean Abs: 1.244 | Max: 5.273 [LOSS Ex2] A: 0.30477 | B: 0.42377 | C: 0.37031 ** [JOINT LOSS] ** : 1.044746 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002721 | Grad Max: 0.059979 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.114844 | Grad Max: 0.637439 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.009298 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007963 | Grad Max: 0.007963 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000922 | Grad Max: 0.059265 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017516 | Grad Max: 0.337286 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.005656 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008085 | Grad Max: 0.031014 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000532 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001831 | Grad Max: 0.004603 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000653 | Grad Max: 0.001983 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001316 | Grad Max: 0.003955 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021182 | Grad Max: 0.021182 [GRADIENT NORM TOTAL] 2.3309 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.299 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084773 0.49152276] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 27/2021 | B: 184/1672 | C: 252/1796 [LOSS Ex1] A: 0.67925 | B: 0.68117 | C: 0.67642 [LOGITS Ex2 A] Mean Abs: 1.207 | Max: 6.359 [LOSS Ex2] A: 0.30040 | B: 0.41519 | C: 0.40455 ** [JOINT LOSS] ** : 1.052326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004127 | Grad Max: 0.092690 -> Layer: shared_layers.0.bias | Grad Mean: 0.192684 | Grad Max: 1.036279 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.009758 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014880 | Grad Max: 0.014880 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.174107 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028635 | Grad Max: 0.980445 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 
| Grad Max: 0.008239 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012996 | Grad Max: 0.045051 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000754 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002992 | Grad Max: 0.007087 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001057 | Grad Max: 0.002839 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002441 | Grad Max: 0.004897 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034874 | Grad Max: 0.034874 [GRADIENT NORM TOTAL] 3.9210 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.284 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50049126 0.49950877] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 31/2017 | B: 189/1859 | C: 233/1815 [LOSS Ex1] A: 0.67874 | B: 0.68078 | C: 0.67615 [LOGITS Ex2 A] Mean Abs: 1.188 | Max: 5.751 [LOSS Ex2] A: 0.31386 | B: 0.45386 | C: 0.41499 ** [JOINT LOSS] ** : 1.072794 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006006 | Grad Max: 0.147200 -> Layer: shared_layers.0.bias | Grad Mean: 0.328629 | Grad Max: 1.802913 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.009507 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008498 | Grad Max: 0.008498 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002574 | Grad Max: 0.233592 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048765 | Grad Max: 1.309110 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.015260 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022340 | Grad Max: 0.081715 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005128 | Grad Max: 0.011698 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000601 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001825 | Grad Max: 
0.004670 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004233 | Grad Max: 0.007880 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061297 | Grad Max: 0.061297 [GRADIENT NORM TOTAL] 6.6731 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5057465 0.49425352] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 32/2016 | B: 210/1838 | C: 241/1807 [LOSS Ex1] A: 0.67785 | B: 0.68114 | C: 0.67709 [LOGITS Ex2 A] Mean Abs: 1.193 | Max: 5.001 [LOSS Ex2] A: 0.30825 | B: 0.43318 | C: 0.42117 ** [JOINT LOSS] ** : 1.066221 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003390 | Grad Max: 0.110160 -> Layer: shared_layers.0.bias | Grad Mean: 0.200708 | Grad Max: 1.139364 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.009092 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004962 | Grad Max: 0.004962 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001539 | Grad Max: 0.099331 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029211 | Grad Max: 0.561482 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000368 | Grad Max: 0.009440 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013599 | Grad Max: 0.051670 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000728 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003126 | Grad Max: 0.007160 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000391 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001121 | Grad Max: 0.002885 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002767 | Grad Max: 0.005193 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039834 | Grad Max: 0.039834 [GRADIENT NORM TOTAL] 3.9903 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.251 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5081375 0.49186242] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 34/2014 | B: 196/1852 | C: 239/1809 [LOSS Ex1] A: 0.68037 | B: 0.67899 | C: 0.67719 [LOGITS Ex2 A] Mean Abs: 1.215 | Max: 4.746 [LOSS Ex2] A: 0.32435 | B: 0.41596 | C: 0.41264 ** [JOINT LOSS] ** : 1.063167 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003271 | Grad Max: 0.087570 -> Layer: shared_layers.0.bias | Grad Mean: 0.152281 | Grad Max: 0.857749 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001825 | Grad Max: 0.007621 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000495 | Grad Max: 0.000495 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.075780 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021832 | Grad Max: 0.426813 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.006294 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010056 | Grad Max: 0.033897 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000635 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002300 | Grad Max: 0.005646 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000811 | Grad Max: 0.002145 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001695 | Grad Max: 0.003531 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025831 | Grad Max: 0.025831 [GRADIENT NORM TOTAL] 2.9726 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.175 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5120906 0.48790935] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.014 [MASKS] A(Pass/Fail): 27/2021 | B: 184/1672 | C: 263/1785 [LOSS Ex1] A: 0.67994 | B: 0.68108 | C: 0.67564 [LOGITS Ex2 A] Mean Abs: 1.198 | Max: 5.338 [LOSS Ex2] A: 0.31069 | B: 0.41908 | C: 0.39522 ** [JOINT LOSS] ** : 1.053879 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.004385 | Grad Max: 0.109784 -> Layer: shared_layers.0.bias | Grad Mean: 0.196777 | Grad Max: 1.152315 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001830 | Grad Max: 0.008190 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000094 | Grad Max: 0.000094 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.103626 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029252 | Grad Max: 0.588486 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000363 | Grad Max: 0.008831 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013313 | Grad Max: 0.051239 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000777 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003064 | Grad Max: 0.006726 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000386 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001087 | Grad Max: 0.002873 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002373 | Grad Max: 0.005149 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036299 | Grad Max: 0.036299 [GRADIENT NORM TOTAL] 3.9388 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.268 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5480591 0.45194086] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 32/2016 | B: 189/1859 | C: 268/1780 [LOSS Ex1] A: 0.67885 | B: 0.68069 | C: 0.67534 [LOGITS Ex2 A] Mean Abs: 1.209 | Max: 5.100 [LOSS Ex2] A: 0.30081 | B: 0.43593 | C: 0.37239 ** [JOINT LOSS] ** : 1.048002 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.055518 -> Layer: shared_layers.0.bias | Grad Mean: 0.094466 | Grad Max: 0.539727 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.009704 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014012 | Grad Max: 0.014012 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000751 | Grad Max: 0.048266 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.013692 | Grad Max: 0.263290 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.004806 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006181 | Grad Max: 0.023445 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000446 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001425 | Grad Max: 0.003678 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000204 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001412 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000967 | Grad Max: 0.003160 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016071 | Grad Max: 0.016071 [GRADIENT NORM TOTAL] 1.8977 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.301 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50410914 0.49589086] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 33/2015 | B: 213/1835 | C: 256/1792 [LOSS Ex1] A: 0.68083 | B: 0.68105 | C: 0.67639 [LOGITS Ex2 A] Mean Abs: 1.184 | Max: 4.762 [LOSS Ex2] A: 0.29956 | B: 0.44006 | C: 0.40326 ** [JOINT LOSS] ** : 1.060383 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006157 | Grad Max: 0.158452 -> Layer: shared_layers.0.bias | Grad Mean: 0.248653 | Grad Max: 1.309926 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.008583 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010495 | Grad Max: 0.010495 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002005 | Grad Max: 0.114558 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037189 | Grad Max: 0.632352 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.009385 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016570 | Grad Max: 0.054660 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001026 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003827 | Grad Max: 0.009208 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | 
Grad Max: 0.000451 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001355 | Grad Max: 0.003450 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003206 | Grad Max: 0.005978 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045624 | Grad Max: 0.045624 [GRADIENT NORM TOTAL] 4.8789 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.129 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54660416 0.4533958 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 45/2003 | B: 197/1851 | C: 228/1820 [LOSS Ex1] A: 0.67958 | B: 0.67889 | C: 0.67718 [LOGITS Ex2 A] Mean Abs: 1.180 | Max: 4.804 [LOSS Ex2] A: 0.31782 | B: 0.42881 | C: 0.40077 ** [JOINT LOSS] ** : 1.061014 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007096 | Grad Max: 0.173807 -> Layer: shared_layers.0.bias | Grad Mean: 0.321708 | Grad Max: 1.734709 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001943 | Grad Max: 0.008876 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007685 | Grad Max: 0.007685 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.145691 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048846 | Grad Max: 0.785763 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.014008 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022226 | Grad Max: 0.074157 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001315 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005110 | Grad Max: 0.011972 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000574 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001807 | Grad Max: 0.004615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004194 | Grad Max: 0.007872 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060538 | Grad Max: 0.060538 [GRADIENT NORM TOTAL] 6.3849 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.175 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5272953 0.47270474] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 33/1583 | B: 188/1668 | C: 250/1798 [LOSS Ex1] A: 0.67875 | B: 0.68098 | C: 0.67635 [LOGITS Ex2 A] Mean Abs: 1.224 | Max: 5.024 [LOSS Ex2] A: 0.29011 | B: 0.40574 | C: 0.38036 ** [JOINT LOSS] ** : 1.037429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004889 | Grad Max: 0.134210 -> Layer: shared_layers.0.bias | Grad Mean: 0.239265 | Grad Max: 1.259770 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.009224 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005145 | Grad Max: 0.005145 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.110956 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034861 | Grad Max: 0.612765 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.009723 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016006 | Grad Max: 0.055441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000970 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003675 | Grad Max: 0.008582 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000427 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001282 | Grad Max: 0.003244 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002885 | Grad Max: 0.006434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041493 | Grad Max: 0.041493 [GRADIENT NORM TOTAL] 4.7400 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.302 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084324 0.49156755] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 37/2011 | B: 191/1857 | C: 235/1813 [LOSS Ex1] A: 0.67906 | B: 0.68060 | C: 0.67664 [LOGITS Ex2 A] Mean Abs: 1.245 | Max: 5.556 [LOSS Ex2] A: 0.29793 | B: 0.44208 | C: 0.39011 
** [JOINT LOSS] ** : 1.055476 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003245 | Grad Max: 0.073972 -> Layer: shared_layers.0.bias | Grad Mean: 0.138158 | Grad Max: 0.683315 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.009225 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008951 | Grad Max: 0.008951 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001074 | Grad Max: 0.092359 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020124 | Grad Max: 0.525287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007029 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009104 | Grad Max: 0.037173 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000541 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002078 | Grad Max: 0.004893 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000744 | Grad Max: 0.002055 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001707 | Grad Max: 0.004199 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025788 | Grad Max: 0.025788 [GRADIENT NORM TOTAL] 2.7190 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.286 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50070375 0.49929625] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 42/2006 | B: 216/1832 | C: 178/1198 [LOSS Ex1] A: 0.67853 | B: 0.68096 | C: 0.67719 [LOGITS Ex2 A] Mean Abs: 1.249 | Max: 5.199 [LOSS Ex2] A: 0.32893 | B: 0.43407 | C: 0.39717 ** [JOINT LOSS] ** : 1.065615 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005185 | Grad Max: 0.113163 -> Layer: shared_layers.0.bias | Grad Mean: 0.276575 | Grad Max: 1.454878 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.010487 -> Layer: exit1_layers.0.bias | Grad Mean: 0.021267 | Grad Max: 0.021267 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.002106 | Grad Max: 0.147824 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039941 | Grad Max: 0.850868 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.011268 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018366 | Grad Max: 0.061357 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.001072 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004195 | Grad Max: 0.009881 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000467 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001471 | Grad Max: 0.003739 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003119 | Grad Max: 0.005815 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047270 | Grad Max: 0.047270 [GRADIENT NORM TOTAL] 5.4845 [EPOCH SUMMARY] Train Loss: 1.0571 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0270 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0329 -> New: 1.0270) ############################## EPOCH 32/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.294 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056307 0.49436936] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 39/2009 | B: 199/1849 | C: 248/1800 [LOSS Ex1] A: 0.67764 | B: 0.67879 | C: 0.67650 [LOGITS Ex2 A] Mean Abs: 1.231 | Max: 4.799 [LOSS Ex2] A: 0.32637 | B: 0.42379 | C: 0.37957 ** [JOINT LOSS] ** : 1.054220 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004254 | Grad Max: 0.133847 -> Layer: shared_layers.0.bias | Grad Mean: 0.138790 | Grad Max: 0.672072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.010196 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014212 | Grad Max: 0.014212 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001208 | Grad Max: 0.076862 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022074 | Grad 
Max: 0.402982 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.006714 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009698 | Grad Max: 0.036333 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002238 | Grad Max: 0.005728 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000795 | Grad Max: 0.002141 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001797 | Grad Max: 0.003917 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026501 | Grad Max: 0.026501 [GRADIENT NORM TOTAL] 2.8257 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.253 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080842 0.49191582] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 47/2001 | B: 188/1668 | C: 236/1812 [LOSS Ex1] A: 0.68021 | B: 0.68089 | C: 0.67757 [LOGITS Ex2 A] Mean Abs: 1.181 | Max: 5.267 [LOSS Ex2] A: 0.29952 | B: 0.41072 | C: 0.41326 ** [JOINT LOSS] ** : 1.054057 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004819 | Grad Max: 0.131335 -> Layer: shared_layers.0.bias | Grad Mean: 0.217381 | Grad Max: 1.158885 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.008320 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007502 | Grad Max: 0.007502 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001693 | Grad Max: 0.120237 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031789 | Grad Max: 0.649333 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000394 | Grad Max: 0.009501 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014372 | Grad Max: 0.051157 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000957 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003300 | Grad Max: 0.008686 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000407 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.001162 | Grad Max: 0.002960 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002755 | Grad Max: 0.005121 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039124 | Grad Max: 0.039124 [GRADIENT NORM TOTAL] 4.3293 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.040 | Max: 0.177 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5124284 0.48757157] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 34/2014 | B: 193/1855 | C: 220/1828 [LOSS Ex1] A: 0.67977 | B: 0.68051 | C: 0.67855 [LOGITS Ex2 A] Mean Abs: 1.141 | Max: 5.659 [LOSS Ex2] A: 0.31375 | B: 0.45825 | C: 0.42091 ** [JOINT LOSS] ** : 1.077243 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005994 | Grad Max: 0.147992 -> Layer: shared_layers.0.bias | Grad Mean: 0.327635 | Grad Max: 1.781703 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.008375 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004725 | Grad Max: 0.004725 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002499 | Grad Max: 0.179392 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047421 | Grad Max: 1.016468 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000597 | Grad Max: 0.014451 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021883 | Grad Max: 0.078354 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001240 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005014 | Grad Max: 0.011019 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000564 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001775 | Grad Max: 0.004333 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004038 | Grad Max: 0.007761 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058568 | Grad Max: 0.058568 [GRADIENT NORM TOTAL] 6.5328 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.042 | Max: 0.270 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54896295 0.45103708] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 51/1997 | B: 217/1831 | C: 262/1786 [LOSS Ex1] A: 0.67865 | B: 0.68087 | C: 0.67621 [LOGITS Ex2 A] Mean Abs: 1.200 | Max: 5.252 [LOSS Ex2] A: 0.29764 | B: 0.43297 | C: 0.39840 ** [JOINT LOSS] ** : 1.054915 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003763 | Grad Max: 0.088303 -> Layer: shared_layers.0.bias | Grad Mean: 0.171571 | Grad Max: 0.958135 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.009878 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015562 | Grad Max: 0.015562 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001367 | Grad Max: 0.113305 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025285 | Grad Max: 0.637056 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.007155 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011443 | Grad Max: 0.040785 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000695 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002637 | Grad Max: 0.006385 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000367 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000927 | Grad Max: 0.002585 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.004804 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030935 | Grad Max: 0.030935 [GRADIENT NORM TOTAL] 3.4528 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.303 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040041 0.49599588] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 45/2003 | B: 200/1848 | C: 244/1804 [LOSS Ex1] A: 0.68067 | B: 0.67870 | C: 0.67649 [LOGITS Ex2 A] Mean Abs: 1.232 | Max: 4.607 [LOSS Ex2] A: 0.30106 | B: 0.42358 | C: 0.38812 ** [JOINT LOSS] ** : 
1.049535 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002584 | Grad Max: 0.063042 -> Layer: shared_layers.0.bias | Grad Mean: 0.167004 | Grad Max: 0.914407 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001912 | Grad Max: 0.008705 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009101 | Grad Max: 0.009101 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001262 | Grad Max: 0.092681 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024275 | Grad Max: 0.516190 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.008294 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011226 | Grad Max: 0.042513 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000738 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002552 | Grad Max: 0.006519 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000311 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000902 | Grad Max: 0.002335 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001893 | Grad Max: 0.004308 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029557 | Grad Max: 0.029557 [GRADIENT NORM TOTAL] 3.4145 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.131 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54739887 0.4526012 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 57/1991 | B: 189/1667 | C: 223/1825 [LOSS Ex1] A: 0.67941 | B: 0.68080 | C: 0.67766 [LOGITS Ex2 A] Mean Abs: 1.258 | Max: 4.911 [LOSS Ex2] A: 0.32472 | B: 0.41985 | C: 0.39012 ** [JOINT LOSS] ** : 1.057517 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005848 | Grad Max: 0.135018 -> Layer: shared_layers.0.bias | Grad Mean: 0.280132 | Grad Max: 1.541836 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.008917 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009173 | Grad Max: 0.009173 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002183 | Grad 
Max: 0.130846 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040999 | Grad Max: 0.727698 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.012543 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018685 | Grad Max: 0.065764 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001041 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004265 | Grad Max: 0.009522 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000523 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001496 | Grad Max: 0.003890 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003262 | Grad Max: 0.006314 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048734 | Grad Max: 0.048734 [GRADIENT NORM TOTAL] 5.5606 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52785987 0.47214013] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 44/1572 | B: 193/1855 | C: 253/1795 [LOSS Ex1] A: 0.67857 | B: 0.68042 | C: 0.67516 [LOGITS Ex2 A] Mean Abs: 1.276 | Max: 5.641 [LOSS Ex2] A: 0.28804 | B: 0.43872 | C: 0.37796 ** [JOINT LOSS] ** : 1.046291 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002466 | Grad Max: 0.064252 -> Layer: shared_layers.0.bias | Grad Mean: 0.115404 | Grad Max: 0.608130 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001947 | Grad Max: 0.008439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000672 | Grad Max: 0.000672 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000892 | Grad Max: 0.052569 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016657 | Grad Max: 0.294588 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.004815 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007526 | Grad Max: 0.026467 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000506 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001719 | Grad Max: 0.004588 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000250 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001850 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001162 | Grad Max: 0.003135 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019262 | Grad Max: 0.019262 [GRADIENT NORM TOTAL] 2.2708 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.304 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083601 0.4916399] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 53/1995 | B: 219/1829 | C: 251/1797 [LOSS Ex1] A: 0.67888 | B: 0.68078 | C: 0.67703 [LOGITS Ex2 A] Mean Abs: 1.223 | Max: 5.545 [LOSS Ex2] A: 0.30885 | B: 0.43240 | C: 0.38462 ** [JOINT LOSS] ** : 1.054186 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003776 | Grad Max: 0.086451 -> Layer: shared_layers.0.bias | Grad Mean: 0.181745 | Grad Max: 0.982500 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001946 | Grad Max: 0.009336 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014138 | Grad Max: 0.014138 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001422 | Grad Max: 0.084964 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026804 | Grad Max: 0.487424 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.008445 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012240 | Grad Max: 0.045138 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000794 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002801 | Grad Max: 0.006673 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000352 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000978 | Grad Max: 0.002518 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002215 | Grad Max: 0.004526 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032520 | Grad Max: 0.032520 [GRADIENT NORM TOTAL] 3.6161 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.288 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50092363 0.49907637] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 57/1991 | B: 203/1845 | C: 249/1799 [LOSS Ex1] A: 0.67833 | B: 0.67860 | C: 0.67487 [LOGITS Ex2 A] Mean Abs: 1.206 | Max: 5.177 [LOSS Ex2] A: 0.30395 | B: 0.42996 | C: 0.40267 ** [JOINT LOSS] ** : 1.056128 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005117 | Grad Max: 0.128874 -> Layer: shared_layers.0.bias | Grad Mean: 0.268606 | Grad Max: 1.429977 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.009081 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004166 | Grad Max: 0.004166 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.124443 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039494 | Grad Max: 0.712641 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000499 | Grad Max: 0.011612 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018357 | Grad Max: 0.063062 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000984 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004193 | Grad Max: 0.008973 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000518 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001478 | Grad Max: 0.003913 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003409 | Grad Max: 0.006406 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049874 | Grad Max: 0.049874 [GRADIENT NORM TOTAL] 5.3279 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.296 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054748 0.49452516] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 54/1994 | B: 190/1666 | C: 247/1801 [LOSS Ex1] A: 0.67744 | B: 0.68071 | C: 0.67687 [LOGITS Ex2 A] Mean Abs: 1.213 | Max: 4.758 
[LOSS Ex2] A: 0.30593 | B: 0.41218 | C: 0.38751 ** [JOINT LOSS] ** : 1.046885 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.066097 -> Layer: shared_layers.0.bias | Grad Mean: 0.136360 | Grad Max: 0.737066 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.009407 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007350 | Grad Max: 0.007350 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001024 | Grad Max: 0.069543 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019017 | Grad Max: 0.385829 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.007040 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008896 | Grad Max: 0.036149 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000577 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002008 | Grad Max: 0.005121 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000699 | Grad Max: 0.001985 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001560 | Grad Max: 0.003662 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023276 | Grad Max: 0.023276 [GRADIENT NORM TOTAL] 2.7866 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.255 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50798726 0.4920127 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 53/1995 | B: 194/1854 | C: 269/1779 [LOSS Ex1] A: 0.68006 | B: 0.68033 | C: 0.67453 [LOGITS Ex2 A] Mean Abs: 1.232 | Max: 4.973 [LOSS Ex2] A: 0.31151 | B: 0.44287 | C: 0.38155 ** [JOINT LOSS] ** : 1.056948 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004511 | Grad Max: 0.104847 -> Layer: shared_layers.0.bias | Grad Mean: 0.220027 | Grad Max: 1.135377 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001870 | Grad Max: 0.008255 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004976 | Grad Max: 
0.004976 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001670 | Grad Max: 0.130665 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031685 | Grad Max: 0.724810 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.009896 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014472 | Grad Max: 0.051796 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000843 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003297 | Grad Max: 0.008078 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000419 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001153 | Grad Max: 0.003157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002315 | Grad Max: 0.005059 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037152 | Grad Max: 0.037152 [GRADIENT NORM TOTAL] 4.3321 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.041 | Max: 0.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5127431 0.48725685] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.515 | Std: 0.015 [MASKS] A(Pass/Fail): 44/2004 | B: 219/1829 | C: 248/1800 [LOSS Ex1] A: 0.67961 | B: 0.68070 | C: 0.67476 [LOGITS Ex2 A] Mean Abs: 1.222 | Max: 5.107 [LOSS Ex2] A: 0.30954 | B: 0.43821 | C: 0.36875 ** [JOINT LOSS] ** : 1.050525 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006261 | Grad Max: 0.166374 -> Layer: shared_layers.0.bias | Grad Mean: 0.303253 | Grad Max: 1.609333 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.008107 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000737 | Grad Max: 0.000737 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.173676 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043460 | Grad Max: 0.959827 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.011908 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019728 | Grad Max: 0.065602 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001067 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004518 | Grad Max: 0.010338 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000518 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001591 | Grad Max: 0.004181 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003422 | Grad Max: 0.007396 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051882 | Grad Max: 0.051882 [GRADIENT NORM TOTAL] 5.9436 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.273 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5498703 0.45012966] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 69/1979 | B: 205/1843 | C: 254/1794 [LOSS Ex1] A: 0.67845 | B: 0.67851 | C: 0.67611 [LOGITS Ex2 A] Mean Abs: 1.247 | Max: 5.032 [LOSS Ex2] A: 0.29802 | B: 0.41839 | C: 0.36690 ** [JOINT LOSS] ** : 1.038789 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004246 | Grad Max: 0.108390 -> Layer: shared_layers.0.bias | Grad Mean: 0.197219 | Grad Max: 1.045097 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.009911 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014772 | Grad Max: 0.014772 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001498 | Grad Max: 0.106560 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028120 | Grad Max: 0.586394 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.008426 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012389 | Grad Max: 0.044961 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000762 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002840 | Grad Max: 0.007175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000351 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001009 | Grad Max: 0.002791 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002183 | Grad Max: 0.005408 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033742 | Grad Max: 0.033742 [GRADIENT NORM 
TOTAL] 3.8652 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.305 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50391847 0.4960815 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 68/1980 | B: 191/1665 | C: 165/1211 [LOSS Ex1] A: 0.68050 | B: 0.68062 | C: 0.67673 [LOGITS Ex2 A] Mean Abs: 1.219 | Max: 5.037 [LOSS Ex2] A: 0.29089 | B: 0.41239 | C: 0.35073 ** [JOINT LOSS] ** : 1.030624 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004337 | Grad Max: 0.141622 -> Layer: shared_layers.0.bias | Grad Mean: 0.135562 | Grad Max: 0.638980 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.008707 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012493 | Grad Max: 0.012493 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.073726 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020393 | Grad Max: 0.391761 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.005883 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008709 | Grad Max: 0.029622 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000659 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002037 | Grad Max: 0.005427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000293 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000702 | Grad Max: 0.002070 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001676 | Grad Max: 0.004671 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022305 | Grad Max: 0.022305 [GRADIENT NORM TOTAL] 2.6829 [EPOCH SUMMARY] Train Loss: 1.0520 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.0298 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 33/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.133 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54816884 0.45183116] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 80/1968 | B: 195/1853 | C: 246/1802 [LOSS Ex1] A: 0.67923 | B: 0.68024 | C: 0.67588 [LOGITS Ex2 A] Mean Abs: 1.202 | Max: 5.613 [LOSS Ex2] A: 0.32046 | B: 0.43774 | C: 0.39544 ** [JOINT LOSS] ** : 1.062999 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005552 | Grad Max: 0.130305 -> Layer: shared_layers.0.bias | Grad Mean: 0.244885 | Grad Max: 1.295592 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.008994 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009318 | Grad Max: 0.009318 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.128464 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036083 | Grad Max: 0.724726 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000441 | Grad Max: 0.009518 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016181 | Grad Max: 0.057397 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000920 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003717 | Grad Max: 0.008377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000427 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001311 | Grad Max: 0.003322 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003126 | Grad Max: 0.006227 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044338 | Grad Max: 0.044338 [GRADIENT NORM TOTAL] 4.8468 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.180 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52836955 0.47163045] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 60/1556 | B: 220/1828 | C: 253/1795 [LOSS Ex1] A: 0.67839 | B: 0.68061 | C: 0.67621 [LOGITS Ex2 A] Mean Abs: 1.249 | Max: 6.098 [LOSS Ex2] A: 0.29669 | B: 0.42841 | C: 0.36481 ** [JOINT LOSS] ** : 1.041704 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002745 | Grad Max: 0.071323 -> Layer: shared_layers.0.bias | Grad Mean: 0.103243 | Grad Max: 0.523793 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.009401 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008860 | Grad Max: 0.008860 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000769 | Grad Max: 0.056745 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014154 | Grad Max: 0.316267 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.004441 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006008 | Grad Max: 0.022298 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000428 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001391 | Grad Max: 0.003584 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000488 | Grad Max: 0.001560 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001280 | Grad Max: 0.003554 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016847 | Grad Max: 0.016847 [GRADIENT NORM TOTAL] 1.9961 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.306 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50831884 0.4916812 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 72/1976 | B: 208/1840 | C: 272/1776 [LOSS Ex1] A: 0.67869 | B: 0.67841 | C: 0.67474 [LOGITS Ex2 A] Mean Abs: 1.271 | Max: 6.144 [LOSS Ex2] A: 0.30764 | B: 0.42526 | C: 0.37084 ** [JOINT LOSS] ** : 1.045197 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003992 | Grad Max: 0.100650 
-> Layer: shared_layers.0.bias | Grad Mean: 0.226475 | Grad Max: 1.243527 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.009223 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007982 | Grad Max: 0.007982 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.105278 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033224 | Grad Max: 0.605087 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.010091 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015463 | Grad Max: 0.056643 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000882 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003524 | Grad Max: 0.007848 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000429 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001242 | Grad Max: 0.003231 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002643 | Grad Max: 0.005376 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041191 | Grad Max: 0.041191 [GRADIENT NORM TOTAL] 4.5598 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50108284 0.4989171 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 76/1972 | B: 192/1664 | C: 255/1793 [LOSS Ex1] A: 0.67813 | B: 0.68053 | C: 0.67486 [LOGITS Ex2 A] Mean Abs: 1.286 | Max: 5.045 [LOSS Ex2] A: 0.32379 | B: 0.42210 | C: 0.38527 ** [JOINT LOSS] ** : 1.054898 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006616 | Grad Max: 0.172676 -> Layer: shared_layers.0.bias | Grad Mean: 0.314607 | Grad Max: 1.721980 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002035 | Grad Max: 0.009746 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011769 | Grad Max: 0.011769 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002433 | Grad Max: 0.154182 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046256 | Grad Max: 0.836916 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.015833 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021440 | Grad Max: 0.081567 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004879 | Grad Max: 0.011169 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000575 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001714 | Grad Max: 0.004432 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003889 | Grad Max: 0.007024 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056978 | Grad Max: 0.056978 [GRADIENT NORM TOTAL] 6.2041 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.298 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50535256 0.49464747] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 81/1967 | B: 195/1853 | C: 222/1826 [LOSS Ex1] A: 0.67724 | B: 0.68015 | C: 0.67556 [LOGITS Ex2 A] Mean Abs: 1.261 | Max: 5.330 [LOSS Ex2] A: 0.31586 | B: 0.44300 | C: 0.37484 ** [JOINT LOSS] ** : 1.055552 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004747 | Grad Max: 0.147704 -> Layer: shared_layers.0.bias | Grad Mean: 0.148851 | Grad Max: 0.754299 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.009086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003902 | Grad Max: 0.003902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001262 | Grad Max: 0.069373 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022916 | Grad Max: 0.373530 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.006831 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009929 | Grad Max: 0.033714 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000673 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002298 | Grad Max: 0.005842 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000308 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000811 | Grad Max: 0.002270 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001833 | Grad Max: 0.004026 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027067 | Grad Max: 0.027067 [GRADIENT NORM TOTAL] 2.9925 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.256 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079285 0.4920715] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 70/1978 | B: 220/1828 | C: 237/1811 [LOSS Ex1] A: 0.67990 | B: 0.68052 | C: 0.67540 [LOGITS Ex2 A] Mean Abs: 1.196 | Max: 4.874 [LOSS Ex2] A: 0.30772 | B: 0.43049 | C: 0.37926 ** [JOINT LOSS] ** : 1.051095 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004066 | Grad Max: 0.111483 -> Layer: shared_layers.0.bias | Grad Mean: 0.235180 | Grad Max: 1.311640 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001812 | Grad Max: 0.007516 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001296 | Grad Max: 0.001296 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.109844 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034423 | Grad Max: 0.622982 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000430 | Grad Max: 0.011358 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015915 | Grad Max: 0.062132 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.001030 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003622 | Grad Max: 0.009061 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000406 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001262 | Grad Max: 0.003230 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002823 | Grad Max: 0.005943 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041818 | Grad Max: 0.041818 [GRADIENT NORM TOTAL] 4.7520 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.041 | Max: 0.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5129882 0.48701182] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.015 [MASKS] A(Pass/Fail): 63/1985 | B: 211/1837 | C: 235/1813 [LOSS Ex1] A: 0.67945 | B: 0.67831 | C: 0.67781 [LOGITS Ex2 A] Mean Abs: 1.174 | Max: 5.402 [LOSS Ex2] A: 0.30319 | B: 0.42517 | C: 0.43304 ** [JOINT LOSS] ** : 1.065657 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006013 | Grad Max: 0.141921 -> Layer: shared_layers.0.bias | Grad Mean: 0.313207 | Grad Max: 1.685053 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.008414 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002721 | Grad Max: 0.002721 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.149744 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046211 | Grad Max: 0.854790 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.016693 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021360 | Grad Max: 0.083542 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.001172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004883 | Grad Max: 0.010560 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000524 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001716 | Grad Max: 0.004145 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003893 | Grad Max: 0.007207 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057424 | Grad Max: 0.057424 [GRADIENT NORM TOTAL] 6.2778 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.275 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55076176 0.44923827] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 92/1956 | B: 193/1663 | C: 255/1793 [LOSS Ex1] A: 0.67826 | B: 0.68044 | C: 0.67458 [LOGITS Ex2 A] Mean Abs: 1.232 | Max: 5.110 [LOSS Ex2] A: 0.28997 | B: 0.41956 | C: 0.38410 ** [JOINT LOSS] ** : 1.042304 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003229 | Grad Max: 0.087235 -> Layer: shared_layers.0.bias | Grad Mean: 0.172311 | Grad Max: 0.910035 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.009594 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012879 | Grad Max: 0.012879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.107805 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025463 | Grad Max: 0.601625 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000319 | Grad Max: 0.007576 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011800 | Grad Max: 0.043179 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000788 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002692 | Grad Max: 0.006289 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000326 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000933 | Grad Max: 0.002560 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002092 | Grad Max: 0.004458 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030373 | Grad Max: 0.030373 [GRADIENT NORM TOTAL] 3.4894 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.308 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50381637 0.49618366] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 78/1970 | B: 196/1852 | C: 239/1809 [LOSS Ex1] A: 0.68035 | B: 0.68006 | C: 0.67620 [LOGITS Ex2 A] Mean Abs: 1.262 | Max: 4.980 [LOSS Ex2] A: 0.29957 | B: 0.43606 | C: 0.39014 ** [JOINT LOSS] ** : 1.054122 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003148 | Grad Max: 0.069340 -> Layer: shared_layers.0.bias | Grad Mean: 0.177863 | Grad Max: 0.968507 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001846 | Grad Max: 0.008372 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009079 | Grad Max: 0.009079 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001315 | Grad Max: 
0.085110 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025155 | Grad Max: 0.478028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000309 | Grad Max: 0.007631 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011539 | Grad Max: 0.040648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000694 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002596 | Grad Max: 0.006296 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000303 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000901 | Grad Max: 0.002408 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001819 | Grad Max: 0.003961 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028515 | Grad Max: 0.028515 [GRADIENT NORM TOTAL] 3.5395 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.135 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5488888 0.45111117] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 92/1956 | B: 220/1828 | C: 241/1807 [LOSS Ex1] A: 0.67906 | B: 0.68043 | C: 0.67802 [LOGITS Ex2 A] Mean Abs: 1.277 | Max: 4.860 [LOSS Ex2] A: 0.31183 | B: 0.44001 | C: 0.39232 ** [JOINT LOSS] ** : 1.060558 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004460 | Grad Max: 0.122717 -> Layer: shared_layers.0.bias | Grad Mean: 0.295780 | Grad Max: 1.647182 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001875 | Grad Max: 0.008864 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011857 | Grad Max: 0.011857 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.133095 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041453 | Grad Max: 0.754487 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000505 | Grad Max: 0.013667 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018886 | Grad Max: 0.074256 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001110 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004261 | Grad Max: 0.010375 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000472 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001483 | Grad Max: 0.003739 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003175 | Grad Max: 0.005972 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048011 | Grad Max: 0.048011 [GRADIENT NORM TOTAL] 5.8857 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5288698 0.47113016] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 74/1542 | B: 212/1836 | C: 253/1795 [LOSS Ex1] A: 0.67821 | B: 0.67821 | C: 0.67607 [LOGITS Ex2 A] Mean Abs: 1.296 | Max: 5.046 [LOSS Ex2] A: 0.28063 | B: 0.41805 | C: 0.37991 ** [JOINT LOSS] ** : 1.037028 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.042310 -> Layer: shared_layers.0.bias | Grad Mean: 0.108343 | Grad Max: 0.595349 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.008992 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004178 | Grad Max: 0.004178 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000833 | Grad Max: 0.059042 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015913 | Grad Max: 0.332159 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.005024 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007246 | Grad Max: 0.026657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000496 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001629 | Grad Max: 0.003978 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000209 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001601 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001090 | Grad Max: 0.003202 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018355 | Grad Max: 0.018355 [GRADIENT NORM TOTAL] 2.2056 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.309 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50829744 0.49170256] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 96/1952 | B: 193/1663 | C: 228/1820 [LOSS Ex1] A: 0.67851 | B: 0.68035 | C: 0.67774 [LOGITS Ex2 A] Mean Abs: 1.260 | Max: 4.767 [LOSS Ex2] A: 0.29878 | B: 0.41421 | C: 0.41359 ** [JOINT LOSS] ** : 1.054395 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004270 | Grad Max: 0.113107 -> Layer: shared_layers.0.bias | Grad Mean: 0.195671 | Grad Max: 1.004288 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.009674 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015278 | Grad Max: 0.015278 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.090775 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027186 | Grad Max: 0.483506 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.008042 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012803 | Grad Max: 0.047159 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000855 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002925 | Grad Max: 0.007709 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000377 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002786 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002200 | Grad Max: 0.004331 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033106 | Grad Max: 0.033106 [GRADIENT NORM TOTAL] 3.7783 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.293 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50119853 0.49880144] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 91/1957 | B: 197/1851 | C: 242/1806 [LOSS Ex1] A: 0.67794 | B: 0.67996 | C: 0.67650 [LOGITS Ex2 A] Mean Abs: 
1.224 | Max: 5.290 [LOSS Ex2] A: 0.30114 | B: 0.44408 | C: 0.40978 ** [JOINT LOSS] ** : 1.063134 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004819 | Grad Max: 0.131888 -> Layer: shared_layers.0.bias | Grad Mean: 0.305033 | Grad Max: 1.576498 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009940 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015355 | Grad Max: 0.015355 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.158574 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043038 | Grad Max: 0.888311 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000539 | Grad Max: 0.011950 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020013 | Grad Max: 0.069996 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004550 | Grad Max: 0.011412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000551 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001600 | Grad Max: 0.004128 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003643 | Grad Max: 0.006800 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053056 | Grad Max: 0.053056 [GRADIENT NORM TOTAL] 6.0458 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.301 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50525594 0.494744 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 103/1945 | B: 220/1828 | C: 195/1181 [LOSS Ex1] A: 0.67705 | B: 0.68034 | C: 0.67382 [LOGITS Ex2 A] Mean Abs: 1.231 | Max: 4.935 [LOSS Ex2] A: 0.30742 | B: 0.42694 | C: 0.38694 ** [JOINT LOSS] ** : 1.050838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.077523 -> Layer: shared_layers.0.bias | Grad Mean: 0.163215 | Grad Max: 0.890655 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.009295 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.005746 | Grad Max: 0.005746 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001179 | Grad Max: 0.083747 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022145 | Grad Max: 0.458802 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.008649 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010636 | Grad Max: 0.046678 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000613 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002387 | Grad Max: 0.006018 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000332 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000835 | Grad Max: 0.002456 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001941 | Grad Max: 0.004187 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028131 | Grad Max: 0.028131 [GRADIENT NORM TOTAL] 3.1367 [EPOCH SUMMARY] Train Loss: 1.0528 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0260 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0270 -> New: 1.0260) ############################## EPOCH 34/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.259 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078804 0.49211955] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 93/1955 | B: 214/1834 | C: 250/1798 [LOSS Ex1] A: 0.67975 | B: 0.67811 | C: 0.67517 [LOGITS Ex2 A] Mean Abs: 1.245 | Max: 4.950 [LOSS Ex2] A: 0.31647 | B: 0.42006 | C: 0.39188 ** [JOINT LOSS] ** : 1.053814 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004379 | Grad Max: 0.107455 -> Layer: shared_layers.0.bias | Grad Mean: 0.234408 | Grad Max: 1.260329 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001893 | Grad Max: 0.007699 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000112 | Grad Max: 0.000112 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001736 | Grad Max: 
0.121082 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032792 | Grad Max: 0.691212 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000399 | Grad Max: 0.009938 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014708 | Grad Max: 0.055558 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000801 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003331 | Grad Max: 0.007518 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000376 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001152 | Grad Max: 0.002959 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002353 | Grad Max: 0.004653 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035971 | Grad Max: 0.035971 [GRADIENT NORM TOTAL] 4.5896 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.180 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5132211 0.48677894] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 81/1967 | B: 197/1659 | C: 238/1810 [LOSS Ex1] A: 0.67930 | B: 0.68026 | C: 0.67710 [LOGITS Ex2 A] Mean Abs: 1.247 | Max: 5.412 [LOSS Ex2] A: 0.31126 | B: 0.42162 | C: 0.36530 ** [JOINT LOSS] ** : 1.044942 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006758 | Grad Max: 0.150611 -> Layer: shared_layers.0.bias | Grad Mean: 0.318463 | Grad Max: 1.672163 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.008999 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008211 | Grad Max: 0.008211 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002457 | Grad Max: 0.173345 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046410 | Grad Max: 0.985597 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000572 | Grad Max: 0.012027 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021163 | Grad Max: 0.070962 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001267 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004822 | Grad Max: 0.011671 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000519 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001687 | Grad Max: 0.004112 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003717 | Grad Max: 0.007253 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055991 | Grad Max: 0.055991 [GRADIENT NORM TOTAL] 6.2287 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.278 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55162746 0.44837254] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 108/1940 | B: 200/1848 | C: 238/1810 [LOSS Ex1] A: 0.67807 | B: 0.67987 | C: 0.67638 [LOGITS Ex2 A] Mean Abs: 1.267 | Max: 5.032 [LOSS Ex2] A: 0.29426 | B: 0.44134 | C: 0.39657 ** [JOINT LOSS] ** : 1.055498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004446 | Grad Max: 0.100987 -> Layer: shared_layers.0.bias | Grad Mean: 0.203608 | Grad Max: 1.066842 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.009863 -> Layer: exit1_layers.0.bias | Grad Mean: 0.019461 | Grad Max: 0.019461 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001554 | Grad Max: 0.106671 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029204 | Grad Max: 0.595119 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.009320 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013371 | Grad Max: 0.049809 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000835 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003036 | Grad Max: 0.007745 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000341 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001050 | Grad Max: 0.002750 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002146 | Grad Max: 0.004318 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033270 | Grad Max: 0.033270 [GRADIENT NORM TOTAL] 3.9989 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.311 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50373113 0.49626887] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 91/1957 | B: 222/1826 | C: 254/1794 [LOSS Ex1] A: 0.68019 | B: 0.68025 | C: 0.67549 [LOGITS Ex2 A] Mean Abs: 1.221 | Max: 5.472 [LOSS Ex2] A: 0.29455 | B: 0.42995 | C: 0.39920 ** [JOINT LOSS] ** : 1.053205 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004035 | Grad Max: 0.107047 -> Layer: shared_layers.0.bias | Grad Mean: 0.157306 | Grad Max: 0.876459 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001803 | Grad Max: 0.008117 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005238 | Grad Max: 0.005238 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001286 | Grad Max: 0.112681 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023839 | Grad Max: 0.612380 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.006064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010242 | Grad Max: 0.032137 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000560 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002358 | Grad Max: 0.005639 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000818 | Grad Max: 0.002310 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001879 | Grad Max: 0.003717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026869 | Grad Max: 0.026869 [GRADIENT NORM TOTAL] 3.2339 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.137 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5496462 0.45035383] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 100/1948 | B: 216/1832 | C: 258/1790 [LOSS Ex1] A: 0.67889 | B: 0.67802 | C: 0.67451 [LOGITS Ex2 A] Mean Abs: 
1.228 | Max: 4.821 [LOSS Ex2] A: 0.30973 | B: 0.42393 | C: 0.38380 ** [JOINT LOSS] ** : 1.049628 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005211 | Grad Max: 0.128521 -> Layer: shared_layers.0.bias | Grad Mean: 0.224957 | Grad Max: 1.219501 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001992 | Grad Max: 0.008603 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006300 | Grad Max: 0.006300 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.125563 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034334 | Grad Max: 0.702602 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.010143 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015348 | Grad Max: 0.054735 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000947 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003490 | Grad Max: 0.008752 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000399 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001211 | Grad Max: 0.003059 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002809 | Grad Max: 0.005480 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040086 | Grad Max: 0.040086 [GRADIENT NORM TOTAL] 4.5360 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.185 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52938104 0.47061896] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 88/1528 | B: 198/1658 | C: 266/1782 [LOSS Ex1] A: 0.67804 | B: 0.68017 | C: 0.67467 [LOGITS Ex2 A] Mean Abs: 1.278 | Max: 4.783 [LOSS Ex2] A: 0.29570 | B: 0.40370 | C: 0.37821 ** [JOINT LOSS] ** : 1.036828 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002966 | Grad Max: 0.068507 -> Layer: shared_layers.0.bias | Grad Mean: 0.131578 | Grad Max: 0.702732 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.008981 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.003753 | Grad Max: 0.003753 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001043 | Grad Max: 0.070976 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019105 | Grad Max: 0.395847 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000236 | Grad Max: 0.005800 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008676 | Grad Max: 0.030583 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000534 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001988 | Grad Max: 0.005115 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000686 | Grad Max: 0.001861 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001579 | Grad Max: 0.003527 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022705 | Grad Max: 0.022705 [GRADIENT NORM TOTAL] 2.6316 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.311 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50825953 0.4917405 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 116/1932 | B: 201/1847 | C: 254/1794 [LOSS Ex1] A: 0.67833 | B: 0.67979 | C: 0.67535 [LOGITS Ex2 A] Mean Abs: 1.292 | Max: 6.410 [LOSS Ex2] A: 0.30903 | B: 0.44808 | C: 0.39160 ** [JOINT LOSS] ** : 1.060727 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005096 | Grad Max: 0.115786 -> Layer: shared_layers.0.bias | Grad Mean: 0.253520 | Grad Max: 1.309974 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.009071 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010084 | Grad Max: 0.010084 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.123927 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036408 | Grad Max: 0.713114 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000456 | Grad Max: 0.010030 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016919 | Grad Max: 0.058159 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | 
Grad Max: 0.001057 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003843 | Grad Max: 0.009263 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000420 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001327 | Grad Max: 0.003427 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002697 | Grad Max: 0.005082 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041661 | Grad Max: 0.041661 [GRADIENT NORM TOTAL] 4.9356 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.295 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50133944 0.4986606 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 105/1943 | B: 226/1822 | C: 242/1806 [LOSS Ex1] A: 0.67775 | B: 0.68017 | C: 0.67619 [LOGITS Ex2 A] Mean Abs: 1.282 | Max: 4.754 [LOSS Ex2] A: 0.31669 | B: 0.44737 | C: 0.38440 ** [JOINT LOSS] ** : 1.060854 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005837 | Grad Max: 0.146125 -> Layer: shared_layers.0.bias | Grad Mean: 0.318799 | Grad Max: 1.695800 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001949 | Grad Max: 0.009043 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008657 | Grad Max: 0.008657 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.163299 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046000 | Grad Max: 0.908077 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.013148 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021115 | Grad Max: 0.076091 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001214 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004787 | Grad Max: 0.010540 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000540 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.004150 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003548 | Grad Max: 0.006647 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053134 | Grad Max: 
0.053134 [GRADIENT NORM TOTAL] 6.3412 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.303 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505163 0.494837] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 117/1931 | B: 219/1829 | C: 246/1802 [LOSS Ex1] A: 0.67685 | B: 0.67793 | C: 0.67626 [LOGITS Ex2 A] Mean Abs: 1.278 | Max: 4.819 [LOSS Ex2] A: 0.31787 | B: 0.41816 | C: 0.38984 ** [JOINT LOSS] ** : 1.052305 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005502 | Grad Max: 0.168970 -> Layer: shared_layers.0.bias | Grad Mean: 0.209254 | Grad Max: 1.060540 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.009225 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005864 | Grad Max: 0.005864 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.109938 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031827 | Grad Max: 0.605209 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.009808 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014230 | Grad Max: 0.052972 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000751 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003264 | Grad Max: 0.007411 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000374 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001136 | Grad Max: 0.002903 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002436 | Grad Max: 0.004883 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036425 | Grad Max: 0.036425 [GRADIENT NORM TOTAL] 4.1644 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.260 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078257 0.49217436] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 103/1945 | B: 200/1656 | C: 
236/1812 [LOSS Ex1] A: 0.67959 | B: 0.68009 | C: 0.67711 [LOGITS Ex2 A] Mean Abs: 1.216 | Max: 4.896 [LOSS Ex2] A: 0.29993 | B: 0.40986 | C: 0.36804 ** [JOINT LOSS] ** : 1.038203 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003742 | Grad Max: 0.081576 -> Layer: shared_layers.0.bias | Grad Mean: 0.170897 | Grad Max: 0.868119 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001785 | Grad Max: 0.008026 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007177 | Grad Max: 0.007177 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001318 | Grad Max: 0.151885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024702 | Grad Max: 0.851390 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.008340 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011092 | Grad Max: 0.043843 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000744 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002526 | Grad Max: 0.006176 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000304 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000860 | Grad Max: 0.002336 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001890 | Grad Max: 0.004368 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027059 | Grad Max: 0.027059 [GRADIENT NORM TOTAL] 3.4759 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.181 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51343113 0.4865689 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 89/1959 | B: 202/1846 | C: 247/1801 [LOSS Ex1] A: 0.67914 | B: 0.67970 | C: 0.67476 [LOGITS Ex2 A] Mean Abs: 1.170 | Max: 5.256 [LOSS Ex2] A: 0.30040 | B: 0.45027 | C: 0.37551 ** [JOINT LOSS] ** : 1.053261 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005384 | Grad Max: 0.119924 -> Layer: shared_layers.0.bias | Grad Mean: 0.298249 | Grad Max: 1.579906 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.001894 | Grad Max: 0.008577 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003173 | Grad Max: 0.003173 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.141180 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042991 | Grad Max: 0.812237 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000533 | Grad Max: 0.012499 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019751 | Grad Max: 0.068227 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.001123 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004478 | Grad Max: 0.010352 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000483 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001558 | Grad Max: 0.003785 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003531 | Grad Max: 0.006755 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051351 | Grad Max: 0.051351 [GRADIENT NORM TOTAL] 5.9367 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.280 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5524715 0.44752848] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 119/1929 | B: 227/1821 | C: 234/1814 [LOSS Ex1] A: 0.67788 | B: 0.68009 | C: 0.67716 [LOGITS Ex2 A] Mean Abs: 1.235 | Max: 5.066 [LOSS Ex2] A: 0.28100 | B: 0.42811 | C: 0.39771 ** [JOINT LOSS] ** : 1.047317 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002696 | Grad Max: 0.071633 -> Layer: shared_layers.0.bias | Grad Mean: 0.153977 | Grad Max: 0.766755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.010007 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016817 | Grad Max: 0.016817 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001115 | Grad Max: 0.080677 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020492 | Grad Max: 0.462606 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.006561 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.009334 | Grad Max: 0.035382 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000638 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002127 | Grad Max: 0.005121 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000737 | Grad Max: 0.002041 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001720 | Grad Max: 0.003892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024444 | Grad Max: 0.024444 [GRADIENT NORM TOTAL] 2.9395 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.313 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50365096 0.49634904] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 104/1944 | B: 220/1828 | C: 246/1802 [LOSS Ex1] A: 0.68003 | B: 0.67784 | C: 0.67483 [LOGITS Ex2 A] Mean Abs: 1.266 | Max: 4.884 [LOSS Ex2] A: 0.29014 | B: 0.42044 | C: 0.36364 ** [JOINT LOSS] ** : 1.035643 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003090 | Grad Max: 0.088830 -> Layer: shared_layers.0.bias | Grad Mean: 0.191854 | Grad Max: 1.089912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001967 | Grad Max: 0.008929 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009268 | Grad Max: 0.009268 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001449 | Grad Max: 0.103132 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027769 | Grad Max: 0.585972 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.007929 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012789 | Grad Max: 0.043262 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000746 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002878 | Grad Max: 0.006679 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001009 | Grad Max: 0.002692 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002094 | 
Grad Max: 0.005224 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033232 | Grad Max: 0.033232 [GRADIENT NORM TOTAL] 3.8928 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.139 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55033994 0.44966003] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.016 [MASKS] A(Pass/Fail): 112/1936 | B: 201/1655 | C: 181/1195 [LOSS Ex1] A: 0.67872 | B: 0.68000 | C: 0.67394 [LOGITS Ex2 A] Mean Abs: 1.277 | Max: 5.023 [LOSS Ex2] A: 0.31968 | B: 0.41642 | C: 0.37609 ** [JOINT LOSS] ** : 1.048287 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004971 | Grad Max: 0.118842 -> Layer: shared_layers.0.bias | Grad Mean: 0.234652 | Grad Max: 1.271636 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001901 | Grad Max: 0.008545 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004607 | Grad Max: 0.004607 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001820 | Grad Max: 0.112619 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034201 | Grad Max: 0.632753 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000417 | Grad Max: 0.009756 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015478 | Grad Max: 0.054966 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000888 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003514 | Grad Max: 0.008239 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000400 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001223 | Grad Max: 0.003033 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002659 | Grad Max: 0.005231 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039831 | Grad Max: 0.039831 [GRADIENT NORM TOTAL] 4.6561 [EPOCH SUMMARY] Train Loss: 1.0493 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0183 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0260 -> New: 1.0183) ############################## EPOCH 35/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.187 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52984667 0.47015333] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 94/1522 | B: 203/1845 | C: 248/1800 [LOSS Ex1] A: 0.67786 | B: 0.67962 | C: 0.67646 [LOGITS Ex2 A] Mean Abs: 1.279 | Max: 4.915 [LOSS Ex2] A: 0.29040 | B: 0.43804 | C: 0.37076 ** [JOINT LOSS] ** : 1.044381 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001563 | Grad Max: 0.030085 -> Layer: shared_layers.0.bias | Grad Mean: 0.075450 | Grad Max: 0.418441 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001922 | Grad Max: 0.009033 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006612 | Grad Max: 0.006612 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000587 | Grad Max: 0.035372 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010857 | Grad Max: 0.194953 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.005562 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005000 | Grad Max: 0.022008 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000401 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001087 | Grad Max: 0.003004 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000172 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001103 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000619 | Grad Max: 0.002163 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010686 | Grad Max: 0.010686 [GRADIENT NORM TOTAL] 1.4975 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.314 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50823516 0.49176484] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 130/1918 | B: 228/1820 | C: 277/1771 [LOSS Ex1] A: 0.67815 | B: 0.68000 | C: 0.67330 [LOGITS Ex2 A] Mean Abs: 1.249 | Max: 5.532 [LOSS Ex2] A: 0.29117 | B: 0.43544 | C: 0.36898 ** [JOINT LOSS] ** : 1.042346 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006095 | Grad Max: 0.139372 -> Layer: shared_layers.0.bias | Grad Mean: 0.273130 | Grad Max: 1.433512 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.009033 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008566 | Grad Max: 0.008566 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.133811 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039543 | Grad Max: 0.752272 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000494 | Grad Max: 0.012022 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018337 | Grad Max: 0.068157 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.001042 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004176 | Grad Max: 0.009696 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000477 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001442 | Grad Max: 0.003656 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003219 | Grad Max: 0.005833 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046970 | Grad Max: 0.046970 [GRADIENT NORM TOTAL] 5.2965 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.298 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014685 0.4985315] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 116/1932 | B: 223/1825 | C: 236/1812 [LOSS Ex1] A: 0.67755 | B: 0.67774 | C: 0.67725 [LOGITS Ex2 A] Mean Abs: 1.231 | Max: 5.023 [LOSS Ex2] A: 0.29323 | B: 0.43406 | C: 0.42610 ** [JOINT LOSS] ** : 1.061977 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006446 | Grad Max: 0.150730 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.353738 | Grad Max: 1.852015 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.009474 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011181 | Grad Max: 0.011181 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002637 | Grad Max: 0.173118 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050257 | Grad Max: 0.973984 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000620 | Grad Max: 0.015506 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023156 | Grad Max: 0.088698 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001204 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005246 | Grad Max: 0.011898 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000648 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001827 | Grad Max: 0.004659 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003988 | Grad Max: 0.007770 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060012 | Grad Max: 0.060012 [GRADIENT NORM TOTAL] 6.8889 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.305 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050696 0.4949304] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 129/1919 | B: 205/1651 | C: 256/1792 [LOSS Ex1] A: 0.67665 | B: 0.67991 | C: 0.67440 [LOGITS Ex2 A] Mean Abs: 1.232 | Max: 5.602 [LOSS Ex2] A: 0.30541 | B: 0.41673 | C: 0.39836 ** [JOINT LOSS] ** : 1.050486 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003481 | Grad Max: 0.098876 -> Layer: shared_layers.0.bias | Grad Mean: 0.226729 | Grad Max: 1.224972 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.010110 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011253 | Grad Max: 0.011253 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001644 | Grad Max: 0.113753 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030588 | Grad Max: 0.635706 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000380 
| Grad Max: 0.009767 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014227 | Grad Max: 0.054944 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000882 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003203 | Grad Max: 0.008133 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000354 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001105 | Grad Max: 0.002839 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002338 | Grad Max: 0.004439 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036090 | Grad Max: 0.036090 [GRADIENT NORM TOTAL] 4.4433 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.262 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077407 0.4922593] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 112/1936 | B: 206/1842 | C: 229/1819 [LOSS Ex1] A: 0.67942 | B: 0.67952 | C: 0.67721 [LOGITS Ex2 A] Mean Abs: 1.249 | Max: 4.937 [LOSS Ex2] A: 0.29558 | B: 0.44443 | C: 0.40375 ** [JOINT LOSS] ** : 1.059971 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.061418 -> Layer: shared_layers.0.bias | Grad Mean: 0.106882 | Grad Max: 0.559651 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.007779 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004112 | Grad Max: 0.004112 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000765 | Grad Max: 0.064443 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014378 | Grad Max: 0.360947 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.005788 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006547 | Grad Max: 0.029952 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000429 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001469 | Grad Max: 0.003620 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000202 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000493 | Grad Max: 
0.001494 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000785 | Grad Max: 0.002139 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014007 | Grad Max: 0.014007 [GRADIENT NORM TOTAL] 2.0615 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.042 | Max: 0.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5136884 0.48631167] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.016 [MASKS] A(Pass/Fail): 95/1953 | B: 230/1818 | C: 233/1815 [LOSS Ex1] A: 0.67897 | B: 0.67990 | C: 0.67725 [LOGITS Ex2 A] Mean Abs: 1.233 | Max: 5.125 [LOSS Ex2] A: 0.30559 | B: 0.43234 | C: 0.37308 ** [JOINT LOSS] ** : 1.049042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004731 | Grad Max: 0.123474 -> Layer: shared_layers.0.bias | Grad Mean: 0.229077 | Grad Max: 1.168384 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001778 | Grad Max: 0.008168 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005577 | Grad Max: 0.005577 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001705 | Grad Max: 0.122931 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032045 | Grad Max: 0.684206 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.010760 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014311 | Grad Max: 0.052035 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.001022 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003263 | Grad Max: 0.008421 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001138 | Grad Max: 0.002881 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002411 | Grad Max: 0.005314 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036862 | Grad Max: 0.036862 [GRADIENT NORM TOTAL] 4.4591 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.283 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.553426 0.44657397] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 130/1918 | B: 224/1824 | C: 231/1817 [LOSS Ex1] A: 0.67768 | B: 0.67764 | C: 0.67764 [LOGITS Ex2 A] Mean Abs: 1.256 | Max: 5.045 [LOSS Ex2] A: 0.29977 | B: 0.40909 | C: 0.37523 ** [JOINT LOSS] ** : 1.039014 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003689 | Grad Max: 0.102090 -> Layer: shared_layers.0.bias | Grad Mean: 0.133377 | Grad Max: 0.648451 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.010188 -> Layer: exit1_layers.0.bias | Grad Mean: 0.022089 | Grad Max: 0.022089 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001013 | Grad Max: 0.103811 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018843 | Grad Max: 0.558659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.005189 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008319 | Grad Max: 0.027665 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000508 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001912 | Grad Max: 0.004426 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000223 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000664 | Grad Max: 0.001657 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001272 | Grad Max: 0.003792 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020873 | Grad Max: 0.020873 [GRADIENT NORM TOTAL] 2.6221 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.316 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5035428 0.49645725] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 123/1925 | B: 208/1648 | C: 241/1807 [LOSS Ex1] A: 0.67985 | B: 0.67981 | C: 0.67531 [LOGITS Ex2 A] Mean Abs: 1.221 | Max: 5.030 [LOSS Ex2] A: 0.28479 | B: 0.41326 | C: 0.38621 ** [JOINT LOSS] ** : 1.039744 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.005673 | Grad Max: 0.149941 -> Layer: shared_layers.0.bias | Grad Mean: 0.278923 | Grad Max: 1.426606 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001796 | Grad Max: 0.008077 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004880 | Grad Max: 0.004880 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.142388 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039315 | Grad Max: 0.777520 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000480 | Grad Max: 0.011507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017874 | Grad Max: 0.063502 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001050 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004089 | Grad Max: 0.010115 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000445 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001407 | Grad Max: 0.003543 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003036 | Grad Max: 0.006414 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045080 | Grad Max: 0.045080 [GRADIENT NORM TOTAL] 5.4257 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5512097 0.44879034] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 128/1920 | B: 208/1840 | C: 239/1809 [LOSS Ex1] A: 0.67852 | B: 0.67942 | C: 0.67608 [LOGITS Ex2 A] Mean Abs: 1.212 | Max: 5.201 [LOSS Ex2] A: 0.30485 | B: 0.44673 | C: 0.39836 ** [JOINT LOSS] ** : 1.061323 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006416 | Grad Max: 0.143084 -> Layer: shared_layers.0.bias | Grad Mean: 0.339486 | Grad Max: 1.778114 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001939 | Grad Max: 0.009216 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009543 | Grad Max: 0.009543 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002573 | Grad Max: 0.169450 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.048560 | Grad Max: 0.940086 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000588 | Grad Max: 0.013782 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021975 | Grad Max: 0.078503 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001185 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005002 | Grad Max: 0.011419 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000546 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001739 | Grad Max: 0.004428 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003868 | Grad Max: 0.007210 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057117 | Grad Max: 0.057117 [GRADIENT NORM TOTAL] 6.6589 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.190 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53040534 0.46959463] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 97/1519 | B: 231/1817 | C: 248/1800 [LOSS Ex1] A: 0.67766 | B: 0.67980 | C: 0.67542 [LOGITS Ex2 A] Mean Abs: 1.272 | Max: 5.591 [LOSS Ex2] A: 0.27735 | B: 0.43447 | C: 0.36613 ** [JOINT LOSS] ** : 1.036943 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004046 | Grad Max: 0.108586 -> Layer: shared_layers.0.bias | Grad Mean: 0.189315 | Grad Max: 0.965502 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.008774 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004004 | Grad Max: 0.004004 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001443 | Grad Max: 0.100706 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027008 | Grad Max: 0.576818 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000339 | Grad Max: 0.008150 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012626 | Grad Max: 0.046838 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000745 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002887 | Grad Max: 0.006590 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | 
Grad Max: 0.000350 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002639 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002312 | Grad Max: 0.005010 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033397 | Grad Max: 0.033397 [GRADIENT NORM TOTAL] 3.7242 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.317 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081723 0.49182773] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 139/1909 | B: 226/1822 | C: 272/1776 [LOSS Ex1] A: 0.67795 | B: 0.67753 | C: 0.67301 [LOGITS Ex2 A] Mean Abs: 1.286 | Max: 6.361 [LOSS Ex2] A: 0.30353 | B: 0.42520 | C: 0.37760 ** [JOINT LOSS] ** : 1.044940 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003178 | Grad Max: 0.081341 -> Layer: shared_layers.0.bias | Grad Mean: 0.201103 | Grad Max: 1.057626 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.009327 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008535 | Grad Max: 0.008535 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001542 | Grad Max: 0.110072 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028976 | Grad Max: 0.615784 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.009273 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013601 | Grad Max: 0.050226 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000797 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003084 | Grad Max: 0.007945 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001073 | Grad Max: 0.002717 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002243 | Grad Max: 0.004718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035319 | Grad Max: 0.035319 [GRADIENT NORM TOTAL] 4.0780 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.300 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50163746 0.49836257] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 125/1923 | B: 210/1646 | C: 266/1782 [LOSS Ex1] A: 0.67733 | B: 0.67971 | C: 0.67364 [LOGITS Ex2 A] Mean Abs: 1.292 | Max: 4.953 [LOSS Ex2] A: 0.30004 | B: 0.41187 | C: 0.35928 ** [JOINT LOSS] ** : 1.033964 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005810 | Grad Max: 0.144451 -> Layer: shared_layers.0.bias | Grad Mean: 0.276216 | Grad Max: 1.505029 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.009346 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009014 | Grad Max: 0.009014 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.158805 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040373 | Grad Max: 0.888416 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.012024 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018577 | Grad Max: 0.068374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.001035 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004217 | Grad Max: 0.009759 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000460 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001462 | Grad Max: 0.003576 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003030 | Grad Max: 0.006027 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047236 | Grad Max: 0.047236 [GRADIENT NORM TOTAL] 5.4737 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.308 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50492465 0.49507535] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 135/1913 | B: 209/1839 | C: 266/1782 [LOSS Ex1] A: 0.67644 | B: 0.67933 | C: 0.67224 [LOGITS Ex2 A] Mean Abs: 1.281 | Max: 5.661 [LOSS Ex2] A: 0.31135 | B: 0.44342 | C: 
0.37520 ** [JOINT LOSS] ** : 1.052659 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003027 | Grad Max: 0.099241 -> Layer: shared_layers.0.bias | Grad Mean: 0.134324 | Grad Max: 0.673751 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.008819 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000305 | Grad Max: 0.000305 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001083 | Grad Max: 0.070415 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019928 | Grad Max: 0.403514 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.007091 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008853 | Grad Max: 0.035058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000552 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002011 | Grad Max: 0.004873 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000240 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000692 | Grad Max: 0.001909 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001383 | Grad Max: 0.003385 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021986 | Grad Max: 0.021986 [GRADIENT NORM TOTAL] 2.6691 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.264 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50762796 0.49237207] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.516 | Std: 0.017 [MASKS] A(Pass/Fail): 117/1931 | B: 234/1814 | C: 161/1215 [LOSS Ex1] A: 0.67926 | B: 0.67971 | C: 0.67595 [LOGITS Ex2 A] Mean Abs: 1.225 | Max: 5.923 [LOSS Ex2] A: 0.29873 | B: 0.42515 | C: 0.39047 ** [JOINT LOSS] ** : 1.049757 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004036 | Grad Max: 0.107878 -> Layer: shared_layers.0.bias | Grad Mean: 0.158279 | Grad Max: 0.839885 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.007849 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007285 | Grad Max: 0.007285 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.001218 | Grad Max: 0.090999 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022895 | Grad Max: 0.506899 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.006909 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010282 | Grad Max: 0.035414 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000653 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002339 | Grad Max: 0.005785 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000814 | Grad Max: 0.002188 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001942 | Grad Max: 0.004225 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027304 | Grad Max: 0.027304 [GRADIENT NORM TOTAL] 3.0835 [EPOCH SUMMARY] Train Loss: 1.0476 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0234 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 36/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.184 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5139389 0.48606113] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.516 | Std: 0.017 [MASKS] A(Pass/Fail): 108/1940 | B: 227/1821 | C: 232/1816 [LOSS Ex1] A: 0.67880 | B: 0.67743 | C: 0.67608 [LOGITS Ex2 A] Mean Abs: 1.185 | Max: 5.532 [LOSS Ex2] A: 0.29922 | B: 0.41500 | C: 0.41483 ** [JOINT LOSS] ** : 1.053787 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004926 | Grad Max: 0.121432 -> Layer: shared_layers.0.bias | Grad Mean: 0.275188 | Grad Max: 1.496322 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.007957 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001144 | Grad Max: 0.001144 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.141121 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039263 | Grad 
Max: 0.776415 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.012161 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018403 | Grad Max: 0.066386 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000988 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004153 | Grad Max: 0.009627 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000429 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001445 | Grad Max: 0.003614 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003266 | Grad Max: 0.006287 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048589 | Grad Max: 0.048589 [GRADIENT NORM TOTAL] 5.4781 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.285 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55438626 0.44561374] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.017 [MASKS] A(Pass/Fail): 139/1909 | B: 212/1644 | C: 232/1816 [LOSS Ex1] A: 0.67748 | B: 0.67962 | C: 0.67607 [LOGITS Ex2 A] Mean Abs: 1.245 | Max: 5.204 [LOSS Ex2] A: 0.27542 | B: 0.41253 | C: 0.37893 ** [JOINT LOSS] ** : 1.033349 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002707 | Grad Max: 0.073690 -> Layer: shared_layers.0.bias | Grad Mean: 0.108300 | Grad Max: 0.551556 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.009413 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015293 | Grad Max: 0.015293 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000871 | Grad Max: 0.064781 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016040 | Grad Max: 0.360474 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000197 | Grad Max: 0.004897 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007353 | Grad Max: 0.027554 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000486 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001676 | Grad Max: 0.004207 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000224 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001805 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001149 | Grad Max: 0.003398 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016505 | Grad Max: 0.016505 [GRADIENT NORM TOTAL] 2.1905 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.318 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503435 0.49656495] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 133/1915 | B: 210/1838 | C: 235/1813 [LOSS Ex1] A: 0.67969 | B: 0.67923 | C: 0.67557 [LOGITS Ex2 A] Mean Abs: 1.286 | Max: 4.677 [LOSS Ex2] A: 0.30635 | B: 0.44438 | C: 0.37888 ** [JOINT LOSS] ** : 1.054698 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004389 | Grad Max: 0.096634 -> Layer: shared_layers.0.bias | Grad Mean: 0.245016 | Grad Max: 1.294692 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.008176 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005974 | Grad Max: 0.005974 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.119895 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034504 | Grad Max: 0.675913 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.009924 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016000 | Grad Max: 0.054855 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000892 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003573 | Grad Max: 0.008255 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000387 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001228 | Grad Max: 0.003182 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002458 | Grad Max: 0.004914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039258 | Grad Max: 0.039258 [GRADIENT NORM TOTAL] 4.7717 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.044 | Max: 0.144 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55197996 0.44802004] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 139/1909 | B: 235/1813 | C: 279/1769 [LOSS Ex1] A: 0.67835 | B: 0.67962 | C: 0.67351 [LOGITS Ex2 A] Mean Abs: 1.312 | Max: 4.963 [LOSS Ex2] A: 0.31751 | B: 0.44967 | C: 0.37083 ** [JOINT LOSS] ** : 1.056496 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007120 | Grad Max: 0.161422 -> Layer: shared_layers.0.bias | Grad Mean: 0.380309 | Grad Max: 2.020315 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.008580 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005977 | Grad Max: 0.005977 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002886 | Grad Max: 0.211913 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054354 | Grad Max: 1.154563 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000660 | Grad Max: 0.016506 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024773 | Grad Max: 0.090863 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001214 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005613 | Grad Max: 0.012186 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000585 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001941 | Grad Max: 0.004813 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004138 | Grad Max: 0.007507 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062837 | Grad Max: 0.062837 [GRADIENT NORM TOTAL] 7.5744 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.193 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53087866 0.46912134] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 104/1512 | B: 228/1820 | C: 252/1796 [LOSS Ex1] A: 0.67748 | B: 0.67733 | C: 0.67539 [LOGITS Ex2 A] Mean Abs: 1.320 | Max: 4.946 [LOSS Ex2] A: 0.27882 | B: 0.41897 | C: 0.37958 ** [JOINT LOSS] ** : 
1.035863 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004251 | Grad Max: 0.094889 -> Layer: shared_layers.0.bias | Grad Mean: 0.213091 | Grad Max: 1.146776 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.008965 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003737 | Grad Max: 0.003737 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001632 | Grad Max: 0.117058 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030936 | Grad Max: 0.647550 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.009338 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014190 | Grad Max: 0.053809 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000798 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003216 | Grad Max: 0.007907 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001108 | Grad Max: 0.002863 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002198 | Grad Max: 0.004890 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035287 | Grad Max: 0.035287 [GRADIENT NORM TOTAL] 4.2376 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.319 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50814706 0.49185294] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 158/1890 | B: 213/1643 | C: 222/1826 [LOSS Ex1] A: 0.67776 | B: 0.67953 | C: 0.67832 [LOGITS Ex2 A] Mean Abs: 1.276 | Max: 6.043 [LOSS Ex2] A: 0.29040 | B: 0.40020 | C: 0.37781 ** [JOINT LOSS] ** : 1.034677 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.053544 -> Layer: shared_layers.0.bias | Grad Mean: 0.124016 | Grad Max: 0.657545 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.009240 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016076 | Grad Max: 0.016076 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000921 | 
Grad Max: 0.088004 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017086 | Grad Max: 0.502687 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.005921 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008020 | Grad Max: 0.031724 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000589 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001776 | Grad Max: 0.005229 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000588 | Grad Max: 0.001767 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001164 | Grad Max: 0.003162 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017738 | Grad Max: 0.017738 [GRADIENT NORM TOTAL] 2.5179 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.302 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017598 0.49824017] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 143/1905 | B: 210/1838 | C: 255/1793 [LOSS Ex1] A: 0.67713 | B: 0.67914 | C: 0.67378 [LOGITS Ex2 A] Mean Abs: 1.243 | Max: 5.288 [LOSS Ex2] A: 0.28573 | B: 0.44073 | C: 0.37953 ** [JOINT LOSS] ** : 1.045348 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003359 | Grad Max: 0.088366 -> Layer: shared_layers.0.bias | Grad Mean: 0.206118 | Grad Max: 1.126014 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.009603 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012611 | Grad Max: 0.012611 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.125906 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029398 | Grad Max: 0.716743 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.008947 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013590 | Grad Max: 0.045761 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000767 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003076 | Grad Max: 
0.007310 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001058 | Grad Max: 0.002841 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002287 | Grad Max: 0.004686 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033513 | Grad Max: 0.033513 [GRADIENT NORM TOTAL] 4.1738 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.310 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50483876 0.49516127] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 148/1900 | B: 236/1812 | C: 253/1795 [LOSS Ex1] A: 0.67624 | B: 0.67953 | C: 0.67532 [LOGITS Ex2 A] Mean Abs: 1.261 | Max: 5.581 [LOSS Ex2] A: 0.30440 | B: 0.43428 | C: 0.38089 ** [JOINT LOSS] ** : 1.050221 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.057725 -> Layer: shared_layers.0.bias | Grad Mean: 0.109517 | Grad Max: 0.573164 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.009648 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009611 | Grad Max: 0.009611 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000815 | Grad Max: 0.050640 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014476 | Grad Max: 0.278997 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.005770 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006298 | Grad Max: 0.029548 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000374 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001367 | Grad Max: 0.003672 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000471 | Grad Max: 0.001298 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.002546 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016475 | Grad Max: 0.016475 [GRADIENT NORM TOTAL] 2.1148 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075457 0.49245432] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 132/1916 | B: 228/1820 | C: 223/1825 [LOSS Ex1] A: 0.67910 | B: 0.67723 | C: 0.67638 [LOGITS Ex2 A] Mean Abs: 1.283 | Max: 4.873 [LOSS Ex2] A: 0.30838 | B: 0.41267 | C: 0.36695 ** [JOINT LOSS] ** : 1.040237 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003474 | Grad Max: 0.090508 -> Layer: shared_layers.0.bias | Grad Mean: 0.241078 | Grad Max: 1.272005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.007980 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003534 | Grad Max: 0.003534 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001712 | Grad Max: 0.147137 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032492 | Grad Max: 0.800623 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000402 | Grad Max: 0.010842 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015118 | Grad Max: 0.060125 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000841 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003399 | Grad Max: 0.007823 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000350 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001176 | Grad Max: 0.002915 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002413 | Grad Max: 0.005470 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038288 | Grad Max: 0.038288 [GRADIENT NORM TOTAL] 4.7784 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.043 | Max: 0.185 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51418275 0.48581725] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 117/1931 | B: 213/1643 | C: 259/1789 [LOSS Ex1] A: 0.67864 | B: 0.67943 | C: 0.67458 [LOGITS Ex2 A] Mean Abs: 
1.269 | Max: 5.427 [LOSS Ex2] A: 0.30513 | B: 0.41858 | C: 0.37531 ** [JOINT LOSS] ** : 1.043895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005423 | Grad Max: 0.165969 -> Layer: shared_layers.0.bias | Grad Mean: 0.315766 | Grad Max: 1.719379 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.008570 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006540 | Grad Max: 0.006540 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002260 | Grad Max: 0.155204 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043127 | Grad Max: 0.875954 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.013126 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020235 | Grad Max: 0.074434 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001121 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004548 | Grad Max: 0.010600 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000528 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001566 | Grad Max: 0.003980 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003207 | Grad Max: 0.006345 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050092 | Grad Max: 0.050092 [GRADIENT NORM TOTAL] 6.1548 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.288 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55529684 0.44470316] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 148/1900 | B: 212/1836 | C: 271/1777 [LOSS Ex1] A: 0.67728 | B: 0.67904 | C: 0.67252 [LOGITS Ex2 A] Mean Abs: 1.281 | Max: 5.149 [LOSS Ex2] A: 0.28695 | B: 0.43670 | C: 0.37515 ** [JOINT LOSS] ** : 1.042546 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003259 | Grad Max: 0.076845 -> Layer: shared_layers.0.bias | Grad Mean: 0.162561 | Grad Max: 0.871794 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.009947 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.012810 | Grad Max: 0.012810 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001193 | Grad Max: 0.101807 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022475 | Grad Max: 0.566475 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.008079 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010701 | Grad Max: 0.047683 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002409 | Grad Max: 0.005780 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000278 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000828 | Grad Max: 0.002333 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001594 | Grad Max: 0.003697 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025946 | Grad Max: 0.025946 [GRADIENT NORM TOTAL] 3.2152 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.321 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033245 0.4966755] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 145/1903 | B: 237/1811 | C: 248/1800 [LOSS Ex1] A: 0.67952 | B: 0.67944 | C: 0.67403 [LOGITS Ex2 A] Mean Abs: 1.266 | Max: 4.958 [LOSS Ex2] A: 0.27446 | B: 0.42749 | C: 0.38047 ** [JOINT LOSS] ** : 1.038471 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006177 | Grad Max: 0.174581 -> Layer: shared_layers.0.bias | Grad Mean: 0.240628 | Grad Max: 1.227337 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001831 | Grad Max: 0.007859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002768 | Grad Max: 0.002768 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001970 | Grad Max: 0.117773 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036133 | Grad Max: 0.685182 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.011862 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015995 | Grad Max: 0.059784 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | 
Grad Max: 0.000851 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003675 | Grad Max: 0.008413 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000397 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003112 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002818 | Grad Max: 0.005621 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041233 | Grad Max: 0.041233 [GRADIENT NORM TOTAL] 4.7519 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.146 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55277413 0.44722587] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 147/1901 | B: 230/1818 | C: 270/1778 [LOSS Ex1] A: 0.67817 | B: 0.67714 | C: 0.67527 [LOGITS Ex2 A] Mean Abs: 1.260 | Max: 5.148 [LOSS Ex2] A: 0.30453 | B: 0.41302 | C: 0.36701 ** [JOINT LOSS] ** : 1.038380 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007194 | Grad Max: 0.159868 -> Layer: shared_layers.0.bias | Grad Mean: 0.294359 | Grad Max: 1.556589 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.009250 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012213 | Grad Max: 0.012213 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002358 | Grad Max: 0.198900 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043952 | Grad Max: 1.112202 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000529 | Grad Max: 0.013079 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019772 | Grad Max: 0.069388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.001016 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004516 | Grad Max: 0.009847 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000483 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001547 | Grad Max: 0.003809 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003417 | Grad Max: 0.006690 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050565 | Grad Max: 
0.050565 [GRADIENT NORM TOTAL] 5.8872 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53141266 0.46858734] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.017 [MASKS] A(Pass/Fail): 111/1505 | B: 216/1640 | C: 176/1200 [LOSS Ex1] A: 0.67730 | B: 0.67935 | C: 0.67259 [LOGITS Ex2 A] Mean Abs: 1.274 | Max: 4.854 [LOSS Ex2] A: 0.28379 | B: 0.41307 | C: 0.41816 ** [JOINT LOSS] ** : 1.048084 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005417 | Grad Max: 0.145894 -> Layer: shared_layers.0.bias | Grad Mean: 0.207262 | Grad Max: 1.046324 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001996 | Grad Max: 0.008272 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002261 | Grad Max: 0.002261 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001680 | Grad Max: 0.124984 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030880 | Grad Max: 0.674745 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000383 | Grad Max: 0.009515 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014125 | Grad Max: 0.049476 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003236 | Grad Max: 0.006965 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000395 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001124 | Grad Max: 0.003278 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002647 | Grad Max: 0.004996 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038076 | Grad Max: 0.038076 [GRADIENT NORM TOTAL] 4.1000 [EPOCH SUMMARY] Train Loss: 1.0440 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0164 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0183 -> New: 1.0164) ############################## EPOCH 37/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.322 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50807244 0.49192753] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 172/1876 | B: 213/1835 | C: 241/1807 [LOSS Ex1] A: 0.67758 | B: 0.67895 | C: 0.67541 [LOGITS Ex2 A] Mean Abs: 1.325 | Max: 6.232 [LOSS Ex2] A: 0.28962 | B: 0.44537 | C: 0.37087 ** [JOINT LOSS] ** : 1.045933 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003298 | Grad Max: 0.073755 -> Layer: shared_layers.0.bias | Grad Mean: 0.194683 | Grad Max: 1.030853 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.009231 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011654 | Grad Max: 0.011654 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001421 | Grad Max: 0.105620 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026897 | Grad Max: 0.588506 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.008185 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012638 | Grad Max: 0.046387 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000736 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002839 | Grad Max: 0.006741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000324 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000974 | Grad Max: 0.002499 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001927 | Grad Max: 0.004268 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031068 | Grad Max: 0.031068 [GRADIENT NORM TOTAL] 3.8182 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.305 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50192165 0.49807835] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 150/1898 | B: 239/1809 | C: 242/1806 [LOSS Ex1] A: 0.67694 | B: 0.67935 | C: 0.67489 [LOGITS Ex2 A] Mean Abs: 1.315 | Max: 4.916 [LOSS Ex2] A: 0.29578 | B: 0.42771 | C: 0.38293 ** [JOINT LOSS] ** : 1.045865 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005788 | Grad Max: 0.150625 -> Layer: shared_layers.0.bias | Grad Mean: 0.307069 | Grad Max: 1.631666 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.009497 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012383 | Grad Max: 0.012383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.188430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043043 | Grad Max: 1.037250 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000525 | Grad Max: 0.014006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019927 | Grad Max: 0.075805 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001068 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004490 | Grad Max: 0.010405 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000462 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001536 | Grad Max: 0.003853 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003052 | Grad Max: 0.006041 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048466 | Grad Max: 0.048466 [GRADIENT NORM TOTAL] 6.0411 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.313 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047155 0.49528447] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 157/1891 | B: 234/1814 | C: 256/1792 [LOSS Ex1] A: 0.67605 | B: 0.67704 | C: 0.67379 [LOGITS Ex2 A] Mean Abs: 1.294 | Max: 5.183 [LOSS Ex2] A: 0.29794 | B: 0.42009 | C: 0.38327 ** [JOINT LOSS] ** : 1.042727 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004457 | Grad Max: 0.145948 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.159860 | Grad Max: 0.832347 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.009548 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006169 | Grad Max: 0.006169 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.087666 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024643 | Grad Max: 0.446413 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.007554 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011238 | Grad Max: 0.036524 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000697 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002566 | Grad Max: 0.006711 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000316 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002405 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001744 | Grad Max: 0.003954 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027709 | Grad Max: 0.027709 [GRADIENT NORM TOTAL] 3.2030 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.268 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50745606 0.49254394] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 142/1906 | B: 217/1639 | C: 240/1808 [LOSS Ex1] A: 0.67894 | B: 0.67926 | C: 0.67616 [LOGITS Ex2 A] Mean Abs: 1.229 | Max: 5.066 [LOSS Ex2] A: 0.29064 | B: 0.40624 | C: 0.39409 ** [JOINT LOSS] ** : 1.041782 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004421 | Grad Max: 0.086008 -> Layer: shared_layers.0.bias | Grad Mean: 0.195523 | Grad Max: 0.967803 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001822 | Grad Max: 0.008174 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008061 | Grad Max: 0.008061 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001458 | Grad Max: 0.139209 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026864 | Grad Max: 0.795686 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000330 | Grad Max: 0.007421 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012358 | Grad Max: 0.039266 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000704 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002814 | Grad Max: 0.006310 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000321 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000961 | Grad Max: 0.002648 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.004443 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031197 | Grad Max: 0.031197 [GRADIENT NORM TOTAL] 3.8051 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.186 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51442754 0.48557252] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 126/1922 | B: 214/1834 | C: 270/1778 [LOSS Ex1] A: 0.67849 | B: 0.67886 | C: 0.67325 [LOGITS Ex2 A] Mean Abs: 1.195 | Max: 5.000 [LOSS Ex2] A: 0.29214 | B: 0.45236 | C: 0.37967 ** [JOINT LOSS] ** : 1.051593 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004808 | Grad Max: 0.132107 -> Layer: shared_layers.0.bias | Grad Mean: 0.337121 | Grad Max: 1.720214 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001913 | Grad Max: 0.008457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000963 | Grad Max: 0.000963 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.232237 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045513 | Grad Max: 1.311239 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000558 | Grad Max: 0.012616 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021171 | Grad Max: 0.070996 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001068 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004756 | Grad Max: 0.011058 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000466 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001628 | Grad 
Max: 0.003907 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003379 | Grad Max: 0.006451 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051518 | Grad Max: 0.051518 [GRADIENT NORM TOTAL] 6.7107 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.290 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.556142 0.44385803] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 157/1891 | B: 244/1804 | C: 251/1797 [LOSS Ex1] A: 0.67710 | B: 0.67926 | C: 0.67424 [LOGITS Ex2 A] Mean Abs: 1.261 | Max: 5.121 [LOSS Ex2] A: 0.27393 | B: 0.42432 | C: 0.37446 ** [JOINT LOSS] ** : 1.034434 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.069340 -> Layer: shared_layers.0.bias | Grad Mean: 0.147570 | Grad Max: 0.785543 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.009087 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011036 | Grad Max: 0.011036 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001077 | Grad Max: 0.073411 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020260 | Grad Max: 0.401177 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.006890 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009593 | Grad Max: 0.037797 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000538 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002163 | Grad Max: 0.005308 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000739 | Grad Max: 0.002228 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001614 | Grad Max: 0.003750 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024025 | Grad Max: 0.024025 [GRADIENT NORM TOTAL] 2.9453 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.324 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5032351 0.49676493] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 153/1895 | B: 234/1814 | C: 248/1800 [LOSS Ex1] A: 0.67937 | B: 0.67695 | C: 0.67469 [LOGITS Ex2 A] Mean Abs: 1.293 | Max: 5.254 [LOSS Ex2] A: 0.28026 | B: 0.41383 | C: 0.37032 ** [JOINT LOSS] ** : 1.031806 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003040 | Grad Max: 0.082154 -> Layer: shared_layers.0.bias | Grad Mean: 0.191184 | Grad Max: 1.022766 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.008594 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010646 | Grad Max: 0.010646 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001388 | Grad Max: 0.091581 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026708 | Grad Max: 0.524436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.009720 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012561 | Grad Max: 0.049560 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000705 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002812 | Grad Max: 0.006716 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000312 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000965 | Grad Max: 0.002454 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001848 | Grad Max: 0.004383 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030538 | Grad Max: 0.030538 [GRADIENT NORM TOTAL] 3.7632 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.148 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5534702 0.44652978] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 155/1893 | B: 219/1637 | C: 256/1792 [LOSS Ex1] A: 0.67801 | B: 0.67917 | C: 0.67536 [LOGITS Ex2 A] Mean Abs: 1.306 | Max: 4.832 [LOSS Ex2] A: 0.29982 | B: 0.42016 | C: 0.38436 ** [JOINT LOSS] ** : 1.045624 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.005210 | Grad Max: 0.121412 -> Layer: shared_layers.0.bias | Grad Mean: 0.268739 | Grad Max: 1.426693 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001828 | Grad Max: 0.008178 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003450 | Grad Max: 0.003450 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.132098 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037480 | Grad Max: 0.744844 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000455 | Grad Max: 0.011259 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017174 | Grad Max: 0.054558 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000832 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003873 | Grad Max: 0.008540 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000412 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001326 | Grad Max: 0.003564 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002818 | Grad Max: 0.005579 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042915 | Grad Max: 0.042915 [GRADIENT NORM TOTAL] 5.2243 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.198 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5318717 0.46812835] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 121/1495 | B: 217/1831 | C: 271/1777 [LOSS Ex1] A: 0.67713 | B: 0.67877 | C: 0.67296 [LOGITS Ex2 A] Mean Abs: 1.329 | Max: 4.906 [LOSS Ex2] A: 0.27423 | B: 0.43251 | C: 0.35252 ** [JOINT LOSS] ** : 1.029372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001758 | Grad Max: 0.037446 -> Layer: shared_layers.0.bias | Grad Mean: 0.070263 | Grad Max: 0.264036 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001989 | Grad Max: 0.008802 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002424 | Grad Max: 0.002424 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000533 | Grad Max: 0.067574 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.009824 | Grad Max: 0.385452 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000105 | Grad Max: 0.004154 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003939 | Grad Max: 0.019615 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000317 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000890 | Grad Max: 0.002630 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000305 | Grad Max: 0.001047 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000592 | Grad Max: 0.002172 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009298 | Grad Max: 0.009298 [GRADIENT NORM TOTAL] 1.4018 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.325 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080243 0.49197572] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 177/1871 | B: 246/1802 | C: 245/1803 [LOSS Ex1] A: 0.67740 | B: 0.67917 | C: 0.67422 [LOGITS Ex2 A] Mean Abs: 1.295 | Max: 5.370 [LOSS Ex2] A: 0.28497 | B: 0.43303 | C: 0.37665 ** [JOINT LOSS] ** : 1.041811 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004953 | Grad Max: 0.130297 -> Layer: shared_layers.0.bias | Grad Mean: 0.262305 | Grad Max: 1.391007 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001913 | Grad Max: 0.008214 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003003 | Grad Max: 0.003003 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.136117 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037250 | Grad Max: 0.771709 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.011469 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017683 | Grad Max: 0.062424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000838 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003978 | Grad Max: 0.009090 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000391 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001355 | Grad Max: 0.003347 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002786 | Grad Max: 0.004970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042893 | Grad Max: 0.042893 [GRADIENT NORM TOTAL] 5.1481 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.307 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020337 0.4979663] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 159/1889 | B: 236/1812 | C: 250/1798 [LOSS Ex1] A: 0.67674 | B: 0.67685 | C: 0.67423 [LOGITS Ex2 A] Mean Abs: 1.271 | Max: 4.991 [LOSS Ex2] A: 0.29415 | B: 0.42358 | C: 0.38651 ** [JOINT LOSS] ** : 1.044018 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005549 | Grad Max: 0.123255 -> Layer: shared_layers.0.bias | Grad Mean: 0.305819 | Grad Max: 1.548545 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.009433 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011216 | Grad Max: 0.011216 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.152736 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041791 | Grad Max: 0.869083 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000517 | Grad Max: 0.011594 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019504 | Grad Max: 0.068430 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000974 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004400 | Grad Max: 0.009969 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000471 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001506 | Grad Max: 0.003678 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003192 | Grad Max: 0.006363 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048509 | Grad Max: 0.048509 [GRADIENT NORM TOTAL] 5.8589 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.315 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046567 0.4953433] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 166/1882 | B: 221/1635 | C: 232/1816 [LOSS Ex1] A: 0.67585 | B: 0.67907 | C: 0.67592 [LOGITS Ex2 A] Mean Abs: 1.259 | Max: 5.261 [LOSS Ex2] A: 0.29228 | B: 0.40103 | C: 0.37010 ** [JOINT LOSS] ** : 1.031420 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002605 | Grad Max: 0.064743 -> Layer: shared_layers.0.bias | Grad Mean: 0.151883 | Grad Max: 0.785398 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.010076 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014324 | Grad Max: 0.014324 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001064 | Grad Max: 0.087506 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019626 | Grad Max: 0.490420 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.006747 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009201 | Grad Max: 0.037587 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000558 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002058 | Grad Max: 0.005414 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000240 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.001868 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001396 | Grad Max: 0.003336 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021301 | Grad Max: 0.021301 [GRADIENT NORM TOTAL] 2.8982 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.270 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074064 0.4925936] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 149/1899 | B: 219/1829 | C: 253/1795 [LOSS Ex1] A: 0.67878 | B: 0.67867 | C: 0.67464 [LOGITS Ex2 A] Mean Abs: 1.289 | Max: 5.101 
[LOSS Ex2] A: 0.28741 | B: 0.45646 | C: 0.36895 ** [JOINT LOSS] ** : 1.048304 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004453 | Grad Max: 0.104131 -> Layer: shared_layers.0.bias | Grad Mean: 0.255662 | Grad Max: 1.351923 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.008069 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006867 | Grad Max: 0.006867 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001877 | Grad Max: 0.122769 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035453 | Grad Max: 0.694595 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.011523 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016658 | Grad Max: 0.067397 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000932 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003748 | Grad Max: 0.009666 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000403 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001277 | Grad Max: 0.003237 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002454 | Grad Max: 0.005039 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039881 | Grad Max: 0.039881 [GRADIENT NORM TOTAL] 5.0237 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.044 | Max: 0.188 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5146518 0.48534822] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.017 [MASKS] A(Pass/Fail): 138/1910 | B: 248/1800 | C: 164/1212 [LOSS Ex1] A: 0.67832 | B: 0.67907 | C: 0.67653 [LOGITS Ex2 A] Mean Abs: 1.292 | Max: 5.172 [LOSS Ex2] A: 0.30018 | B: 0.43565 | C: 0.38199 ** [JOINT LOSS] ** : 1.050584 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006964 | Grad Max: 0.180701 -> Layer: shared_layers.0.bias | Grad Mean: 0.330869 | Grad Max: 1.743767 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001786 | Grad Max: 0.008078 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004071 | Grad Max: 
0.004071 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002487 | Grad Max: 0.179190 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046922 | Grad Max: 1.034956 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.014398 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021630 | Grad Max: 0.077025 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004871 | Grad Max: 0.011575 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000483 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001655 | Grad Max: 0.004059 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003360 | Grad Max: 0.006316 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051533 | Grad Max: 0.051533 [GRADIENT NORM TOTAL] 6.5052 [EPOCH SUMMARY] Train Loss: 1.0418 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0149 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0164 -> New: 1.0149) ############################## EPOCH 38/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.293 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5570397 0.44296032] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 171/1877 | B: 237/1811 | C: 270/1778 [LOSS Ex1] A: 0.67689 | B: 0.67674 | C: 0.67368 [LOGITS Ex2 A] Mean Abs: 1.319 | Max: 5.317 [LOSS Ex2] A: 0.27961 | B: 0.42083 | C: 0.37828 ** [JOINT LOSS] ** : 1.035342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004073 | Grad Max: 0.101231 -> Layer: shared_layers.0.bias | Grad Mean: 0.243084 | Grad Max: 1.290912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.009686 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014463 | Grad Max: 0.014463 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001764 | Grad Max: 0.149091 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.033541 | Grad Max: 0.841282 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.010714 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015832 | Grad Max: 0.060737 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000866 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003531 | Grad Max: 0.008568 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000401 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001210 | Grad Max: 0.002987 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002486 | Grad Max: 0.005244 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038863 | Grad Max: 0.038863 [GRADIENT NORM TOTAL] 4.8480 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.327 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50317013 0.49682987] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 160/1888 | B: 222/1634 | C: 265/1783 [LOSS Ex1] A: 0.67919 | B: 0.67898 | C: 0.67320 [LOGITS Ex2 A] Mean Abs: 1.279 | Max: 5.349 [LOSS Ex2] A: 0.27753 | B: 0.40114 | C: 0.39298 ** [JOINT LOSS] ** : 1.034339 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003465 | Grad Max: 0.119895 -> Layer: shared_layers.0.bias | Grad Mean: 0.119797 | Grad Max: 0.600078 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.008536 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008002 | Grad Max: 0.008002 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001001 | Grad Max: 0.057558 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017911 | Grad Max: 0.326519 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.005331 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007701 | Grad Max: 0.027036 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000488 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001794 | Grad Max: 0.004626 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000233 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000618 | Grad Max: 0.001730 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001473 | Grad Max: 0.003209 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020775 | Grad Max: 0.020775 [GRADIENT NORM TOTAL] 2.3561 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.150 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5542421 0.44575796] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 167/1881 | B: 221/1827 | C: 252/1796 [LOSS Ex1] A: 0.67781 | B: 0.67858 | C: 0.67513 [LOGITS Ex2 A] Mean Abs: 1.273 | Max: 5.531 [LOSS Ex2] A: 0.28893 | B: 0.44438 | C: 0.37887 ** [JOINT LOSS] ** : 1.047905 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003789 | Grad Max: 0.094703 -> Layer: shared_layers.0.bias | Grad Mean: 0.211983 | Grad Max: 1.124358 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001879 | Grad Max: 0.008484 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005563 | Grad Max: 0.005563 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001642 | Grad Max: 0.097210 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030942 | Grad Max: 0.561610 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000378 | Grad Max: 0.008144 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014323 | Grad Max: 0.048248 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000837 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003233 | Grad Max: 0.007964 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000349 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001105 | Grad Max: 0.002845 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002391 | Grad Max: 0.004793 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035773 | Grad Max: 0.035773 [GRADIENT NORM TOTAL] 4.2508 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) 
| Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5323924 0.46760762] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 132/1484 | B: 249/1799 | C: 258/1790 [LOSS Ex1] A: 0.67693 | B: 0.67898 | C: 0.67488 [LOGITS Ex2 A] Mean Abs: 1.320 | Max: 5.895 [LOSS Ex2] A: 0.27477 | B: 0.42279 | C: 0.35313 ** [JOINT LOSS] ** : 1.027163 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.052841 -> Layer: shared_layers.0.bias | Grad Mean: 0.087463 | Grad Max: 0.489496 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.008643 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004136 | Grad Max: 0.004136 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000722 | Grad Max: 0.043029 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013431 | Grad Max: 0.242410 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.004458 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006364 | Grad Max: 0.023699 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000434 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001432 | Grad Max: 0.003651 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000187 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000477 | Grad Max: 0.001500 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001141 | Grad Max: 0.003184 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015560 | Grad Max: 0.015560 [GRADIENT NORM TOTAL] 1.7995 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.328 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079692 0.4920308] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 186/1862 | B: 240/1808 | C: 229/1819 [LOSS Ex1] A: 0.67720 | B: 0.67664 | C: 0.67559 [LOGITS Ex2 A] Mean Abs: 1.346 | Max: 6.928 [LOSS 
Ex2] A: 0.30602 | B: 0.42847 | C: 0.36728 ** [JOINT LOSS] ** : 1.043736 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006215 | Grad Max: 0.149922 -> Layer: shared_layers.0.bias | Grad Mean: 0.330675 | Grad Max: 1.732458 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001994 | Grad Max: 0.009068 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010415 | Grad Max: 0.010415 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002436 | Grad Max: 0.164000 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046195 | Grad Max: 0.920168 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000554 | Grad Max: 0.012689 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021004 | Grad Max: 0.073064 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001138 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004737 | Grad Max: 0.010976 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000509 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001619 | Grad Max: 0.003945 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003311 | Grad Max: 0.006266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051502 | Grad Max: 0.051502 [GRADIENT NORM TOTAL] 6.4764 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.310 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021597 0.49784032] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 174/1874 | B: 224/1632 | C: 241/1807 [LOSS Ex1] A: 0.67653 | B: 0.67888 | C: 0.67384 [LOGITS Ex2 A] Mean Abs: 1.346 | Max: 5.258 [LOSS Ex2] A: 0.30622 | B: 0.42884 | C: 0.38105 ** [JOINT LOSS] ** : 1.048453 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008243 | Grad Max: 0.201307 -> Layer: shared_layers.0.bias | Grad Mean: 0.419588 | Grad Max: 2.288453 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.009343 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010372 | Grad Max: 
0.010372 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003204 | Grad Max: 0.208186 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060903 | Grad Max: 1.140034 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000741 | Grad Max: 0.017857 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028110 | Grad Max: 0.101718 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000118 | Grad Max: 0.001316 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006324 | Grad Max: 0.013664 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000649 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002152 | Grad Max: 0.005208 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004473 | Grad Max: 0.008056 -> Layer: exit2_layers.12.bias | Grad Mean: 0.068655 | Grad Max: 0.068655 [GRADIENT NORM TOTAL] 8.3755 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.318 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50456184 0.49543813] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 176/1872 | B: 221/1827 | C: 257/1791 [LOSS Ex1] A: 0.67564 | B: 0.67848 | C: 0.67515 [LOGITS Ex2 A] Mean Abs: 1.343 | Max: 6.539 [LOSS Ex2] A: 0.31053 | B: 0.45234 | C: 0.38682 ** [JOINT LOSS] ** : 1.059652 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007379 | Grad Max: 0.204467 -> Layer: shared_layers.0.bias | Grad Mean: 0.307654 | Grad Max: 1.606939 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.009672 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011397 | Grad Max: 0.011397 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002456 | Grad Max: 0.148919 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045764 | Grad Max: 0.850356 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.012197 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020699 | Grad Max: 0.067522 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001128 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004692 | Grad Max: 0.011279 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000483 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001579 | Grad Max: 0.004035 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003166 | Grad Max: 0.005953 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049357 | Grad Max: 0.049357 [GRADIENT NORM TOTAL] 6.1507 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.272 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073436 0.49265638] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 157/1891 | B: 249/1799 | C: 246/1802 [LOSS Ex1] A: 0.67861 | B: 0.67889 | C: 0.67380 [LOGITS Ex2 A] Mean Abs: 1.278 | Max: 5.181 [LOSS Ex2] A: 0.28278 | B: 0.42301 | C: 0.37972 ** [JOINT LOSS] ** : 1.038937 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.035486 -> Layer: shared_layers.0.bias | Grad Mean: 0.035075 | Grad Max: 0.241827 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001832 | Grad Max: 0.007252 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002201 | Grad Max: 0.002201 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000336 | Grad Max: 0.038624 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005818 | Grad Max: 0.216635 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.002838 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002460 | Grad Max: 0.011877 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000213 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000553 | Grad Max: 0.002072 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000195 | Grad Max: 0.000670 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000727 | Grad Max: 0.002156 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008249 | Grad Max: 0.008249 [GRADIENT NORM 
TOTAL] 0.7841 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.189 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51484597 0.48515403] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 150/1898 | B: 241/1807 | C: 254/1794 [LOSS Ex1] A: 0.67814 | B: 0.67654 | C: 0.67427 [LOGITS Ex2 A] Mean Abs: 1.224 | Max: 5.174 [LOSS Ex2] A: 0.30574 | B: 0.41341 | C: 0.37008 ** [JOINT LOSS] ** : 1.039396 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003446 | Grad Max: 0.074239 -> Layer: shared_layers.0.bias | Grad Mean: 0.190360 | Grad Max: 0.922417 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001959 | Grad Max: 0.008575 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004917 | Grad Max: 0.004917 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001356 | Grad Max: 0.172356 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025350 | Grad Max: 0.974774 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.007746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011393 | Grad Max: 0.045577 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000728 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002562 | Grad Max: 0.006627 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000298 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000857 | Grad Max: 0.002328 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001783 | Grad Max: 0.003972 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026973 | Grad Max: 0.026973 [GRADIENT NORM TOTAL] 3.8029 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.296 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55797577 0.44202417] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 185/1863 | B: 227/1629 | C: 244/1804 [LOSS Ex1] 
A: 0.67668 | B: 0.67879 | C: 0.67464 [LOGITS Ex2 A] Mean Abs: 1.276 | Max: 4.977 [LOSS Ex2] A: 0.27479 | B: 0.40557 | C: 0.36399 ** [JOINT LOSS] ** : 1.024820 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001632 | Grad Max: 0.032204 -> Layer: shared_layers.0.bias | Grad Mean: 0.087561 | Grad Max: 0.406590 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.009585 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018918 | Grad Max: 0.018918 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000628 | Grad Max: 0.113106 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011073 | Grad Max: 0.639460 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.003857 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004368 | Grad Max: 0.019942 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000294 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000955 | Grad Max: 0.002853 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000309 | Grad Max: 0.001187 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000692 | Grad Max: 0.002266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008864 | Grad Max: 0.008864 [GRADIENT NORM TOTAL] 1.8334 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.330 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50311506 0.49688497] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 170/1878 | B: 222/1826 | C: 257/1791 [LOSS Ex1] A: 0.67901 | B: 0.67837 | C: 0.67400 [LOGITS Ex2 A] Mean Abs: 1.315 | Max: 4.740 [LOSS Ex2] A: 0.27249 | B: 0.43941 | C: 0.35633 ** [JOINT LOSS] ** : 1.033203 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003786 | Grad Max: 0.095272 -> Layer: shared_layers.0.bias | Grad Mean: 0.160657 | Grad Max: 0.898239 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | 
Grad Max: 0.008843 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013019 | Grad Max: 0.013019 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001214 | Grad Max: 0.071034 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023418 | Grad Max: 0.390273 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.007450 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010925 | Grad Max: 0.036798 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000679 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002434 | Grad Max: 0.006004 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000827 | Grad Max: 0.002150 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001582 | Grad Max: 0.003799 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026459 | Grad Max: 0.026459 [GRADIENT NORM TOTAL] 3.1570 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.152 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55504173 0.44495824] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 182/1866 | B: 250/1798 | C: 228/1820 [LOSS Ex1] A: 0.67761 | B: 0.67877 | C: 0.67616 [LOGITS Ex2 A] Mean Abs: 1.319 | Max: 5.481 [LOSS Ex2] A: 0.29173 | B: 0.42960 | C: 0.39219 ** [JOINT LOSS] ** : 1.048687 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.050820 -> Layer: shared_layers.0.bias | Grad Mean: 0.082961 | Grad Max: 0.457914 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.007459 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003851 | Grad Max: 0.003851 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000674 | Grad Max: 0.056316 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012000 | Grad Max: 0.297301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.003841 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004837 | Grad Max: 
0.019884 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000313 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001095 | Grad Max: 0.003267 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000362 | Grad Max: 0.001166 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000549 | Grad Max: 0.001593 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009859 | Grad Max: 0.009859 [GRADIENT NORM TOTAL] 1.6644 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.203 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53288484 0.46711513] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 145/1471 | B: 246/1802 | C: 251/1797 [LOSS Ex1] A: 0.67671 | B: 0.67641 | C: 0.67383 [LOGITS Ex2 A] Mean Abs: 1.333 | Max: 5.787 [LOSS Ex2] A: 0.27706 | B: 0.41767 | C: 0.35579 ** [JOINT LOSS] ** : 1.025823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003144 | Grad Max: 0.078857 -> Layer: shared_layers.0.bias | Grad Mean: 0.138267 | Grad Max: 0.705285 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.009274 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005373 | Grad Max: 0.005373 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001049 | Grad Max: 0.072856 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019143 | Grad Max: 0.410059 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000227 | Grad Max: 0.006169 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008558 | Grad Max: 0.030934 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000478 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001958 | Grad Max: 0.004485 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000659 | Grad Max: 0.001677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001498 | Grad Max: 0.003391 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.021473 | Grad Max: 0.021473 [GRADIENT NORM TOTAL] 2.7265 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.331 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50797945 0.49202058] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 195/1853 | B: 227/1629 | C: 175/1201 [LOSS Ex1] A: 0.67697 | B: 0.67866 | C: 0.67189 [LOGITS Ex2 A] Mean Abs: 1.329 | Max: 5.634 [LOSS Ex2] A: 0.27567 | B: 0.40664 | C: 0.35761 ** [JOINT LOSS] ** : 1.022477 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003318 | Grad Max: 0.096869 -> Layer: shared_layers.0.bias | Grad Mean: 0.099935 | Grad Max: 0.485266 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.009078 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009701 | Grad Max: 0.009701 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000783 | Grad Max: 0.060454 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014102 | Grad Max: 0.303901 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.004348 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006220 | Grad Max: 0.022694 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000417 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001430 | Grad Max: 0.004019 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000468 | Grad Max: 0.001670 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000912 | Grad Max: 0.002603 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013557 | Grad Max: 0.013557 [GRADIENT NORM TOTAL] 1.9432 [EPOCH SUMMARY] Train Loss: 1.0379 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0083 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 1.0149 -> New: 1.0083) ############################## EPOCH 39/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.313 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50224197 0.49775803] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 187/1861 | B: 225/1823 | C: 226/1822 [LOSS Ex1] A: 0.67627 | B: 0.67824 | C: 0.67599 [LOGITS Ex2 A] Mean Abs: 1.334 | Max: 4.666 [LOSS Ex2] A: 0.27509 | B: 0.43576 | C: 0.37658 ** [JOINT LOSS] ** : 1.039312 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.070684 -> Layer: shared_layers.0.bias | Grad Mean: 0.151383 | Grad Max: 0.795545 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.009897 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016292 | Grad Max: 0.016292 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001139 | Grad Max: 0.095704 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021748 | Grad Max: 0.532723 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.006787 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010067 | Grad Max: 0.034554 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000527 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002245 | Grad Max: 0.005288 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000254 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000752 | Grad Max: 0.001941 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001342 | Grad Max: 0.003083 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023180 | Grad Max: 0.023180 [GRADIENT NORM TOTAL] 3.0440 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.322 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504523 0.495477] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 196/1852 | B: 251/1797 | C: 233/1815 [LOSS Ex1] A: 0.67536 | B: 0.67863 | C: 0.67634 [LOGITS Ex2 A] Mean Abs: 1.332 | Max: 5.457 [LOSS Ex2] A: 0.29055 | B: 0.42453 | C: 0.38044 ** [JOINT LOSS] ** : 1.041952 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003809 | Grad Max: 0.107234 -> Layer: shared_layers.0.bias | Grad Mean: 0.128149 | Grad Max: 0.570515 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.009439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011854 | Grad Max: 0.011854 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001100 | Grad Max: 0.093804 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019932 | Grad Max: 0.530769 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.005832 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008893 | Grad Max: 0.035508 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002014 | Grad Max: 0.005322 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000230 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.001811 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.003130 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020932 | Grad Max: 0.020932 [GRADIENT NORM TOTAL] 2.6118 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.275 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073177 0.49268225] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 170/1878 | B: 248/1800 | C: 253/1795 [LOSS Ex1] A: 0.67837 | B: 0.67626 | C: 0.67374 [LOGITS Ex2 A] Mean Abs: 1.279 | Max: 5.051 [LOSS Ex2] A: 0.27809 | B: 0.41300 | C: 0.37414 ** [JOINT LOSS] ** : 1.031201 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003386 | Grad Max: 0.084104 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.132841 | Grad Max: 0.775164 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001927 | Grad Max: 0.007288 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004024 | Grad Max: 0.004024 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001070 | Grad Max: 0.107776 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019810 | Grad Max: 0.606468 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000239 | Grad Max: 0.005832 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009129 | Grad Max: 0.035020 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000548 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002072 | Grad Max: 0.005479 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.001956 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001510 | Grad Max: 0.003633 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021906 | Grad Max: 0.021906 [GRADIENT NORM TOTAL] 2.7918 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.045 | Max: 0.191 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5151163 0.48488367] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.517 | Std: 0.018 [MASKS] A(Pass/Fail): 160/1888 | B: 230/1626 | C: 239/1809 [LOSS Ex1] A: 0.67790 | B: 0.67852 | C: 0.67406 [LOGITS Ex2 A] Mean Abs: 1.262 | Max: 5.598 [LOSS Ex2] A: 0.28214 | B: 0.40254 | C: 0.37509 ** [JOINT LOSS] ** : 1.030082 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003702 | Grad Max: 0.078339 -> Layer: shared_layers.0.bias | Grad Mean: 0.168003 | Grad Max: 0.863160 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.007666 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002338 | Grad Max: 0.002338 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001291 | Grad Max: 0.088398 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024066 | Grad Max: 0.508183 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 
| Grad Max: 0.006895 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011086 | Grad Max: 0.039754 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000702 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002497 | Grad Max: 0.006576 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000291 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000828 | Grad Max: 0.002134 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001756 | Grad Max: 0.003916 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026002 | Grad Max: 0.026002 [GRADIENT NORM TOTAL] 3.3051 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.299 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5592191 0.4407809] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.019 [MASKS] A(Pass/Fail): 197/1851 | B: 231/1817 | C: 263/1785 [LOSS Ex1] A: 0.67639 | B: 0.67810 | C: 0.67243 [LOGITS Ex2 A] Mean Abs: 1.328 | Max: 5.146 [LOSS Ex2] A: 0.27583 | B: 0.43232 | C: 0.34470 ** [JOINT LOSS] ** : 1.026587 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003042 | Grad Max: 0.072090 -> Layer: shared_layers.0.bias | Grad Mean: 0.129853 | Grad Max: 0.629956 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.008966 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010019 | Grad Max: 0.010019 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000962 | Grad Max: 0.098994 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017834 | Grad Max: 0.534929 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.005735 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008163 | Grad Max: 0.028009 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000463 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001831 | Grad Max: 0.004541 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000220 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000619 | Grad Max: 
0.001755 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001158 | Grad Max: 0.003179 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019265 | Grad Max: 0.019265 [GRADIENT NORM TOTAL] 2.5507 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.334 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030554 0.49694455] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 176/1872 | B: 255/1793 | C: 272/1776 [LOSS Ex1] A: 0.67876 | B: 0.67849 | C: 0.67318 [LOGITS Ex2 A] Mean Abs: 1.325 | Max: 5.376 [LOSS Ex2] A: 0.26526 | B: 0.42764 | C: 0.35864 ** [JOINT LOSS] ** : 1.027328 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.052877 -> Layer: shared_layers.0.bias | Grad Mean: 0.136197 | Grad Max: 0.713654 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.008819 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014646 | Grad Max: 0.014646 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000970 | Grad Max: 0.067131 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018085 | Grad Max: 0.378815 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000228 | Grad Max: 0.006062 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008688 | Grad Max: 0.035410 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000479 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001894 | Grad Max: 0.004600 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000214 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001757 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001155 | Grad Max: 0.003683 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019926 | Grad Max: 0.019926 [GRADIENT NORM TOTAL] 2.6737 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.155 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5561235 0.4438765] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 204/1844 | B: 250/1798 | C: 257/1791 [LOSS Ex1] A: 0.67733 | B: 0.67611 | C: 0.67367 [LOGITS Ex2 A] Mean Abs: 1.302 | Max: 5.325 [LOSS Ex2] A: 0.28020 | B: 0.40738 | C: 0.38627 ** [JOINT LOSS] ** : 1.033653 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002318 | Grad Max: 0.051602 -> Layer: shared_layers.0.bias | Grad Mean: 0.130000 | Grad Max: 0.642338 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.009074 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007045 | Grad Max: 0.007045 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000929 | Grad Max: 0.072907 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017266 | Grad Max: 0.405899 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.005603 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007816 | Grad Max: 0.029030 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000424 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001757 | Grad Max: 0.004262 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000586 | Grad Max: 0.001694 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001206 | Grad Max: 0.002736 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018335 | Grad Max: 0.018335 [GRADIENT NORM TOTAL] 2.5181 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.207 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5335658 0.46643415] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.019 [MASKS] A(Pass/Fail): 153/1463 | B: 231/1625 | C: 250/1798 [LOSS Ex1] A: 0.67643 | B: 0.67838 | C: 0.67334 [LOGITS Ex2 A] Mean Abs: 1.336 | Max: 5.641 [LOSS Ex2] A: 0.27065 | B: 0.40573 | C: 0.36689 ** [JOINT LOSS] ** : 1.023805 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.002827 | Grad Max: 0.069330 -> Layer: shared_layers.0.bias | Grad Mean: 0.160580 | Grad Max: 0.747793 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001942 | Grad Max: 0.008686 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004169 | Grad Max: 0.004169 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001129 | Grad Max: 0.096703 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020838 | Grad Max: 0.554275 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.006250 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009475 | Grad Max: 0.032921 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000528 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002130 | Grad Max: 0.004940 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000702 | Grad Max: 0.002112 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001455 | Grad Max: 0.003946 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021690 | Grad Max: 0.021690 [GRADIENT NORM TOTAL] 3.1087 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079225 0.49207756] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 211/1837 | B: 234/1814 | C: 262/1786 [LOSS Ex1] A: 0.67668 | B: 0.67796 | C: 0.67299 [LOGITS Ex2 A] Mean Abs: 1.347 | Max: 6.787 [LOSS Ex2] A: 0.27412 | B: 0.43545 | C: 0.36109 ** [JOINT LOSS] ** : 1.032762 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002956 | Grad Max: 0.085333 -> Layer: shared_layers.0.bias | Grad Mean: 0.105309 | Grad Max: 0.498212 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.008435 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003517 | Grad Max: 0.003518 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000810 | Grad Max: 0.068471 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.014803 | Grad Max: 0.373996 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.005593 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006659 | Grad Max: 0.025481 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000463 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001507 | Grad Max: 0.004112 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000501 | Grad Max: 0.001455 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000884 | Grad Max: 0.002716 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015134 | Grad Max: 0.015134 [GRADIENT NORM TOTAL] 2.0248 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.317 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023615 0.49763855] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 200/1848 | B: 257/1791 | C: 279/1769 [LOSS Ex1] A: 0.67597 | B: 0.67835 | C: 0.67149 [LOGITS Ex2 A] Mean Abs: 1.337 | Max: 5.756 [LOSS Ex2] A: 0.27178 | B: 0.42632 | C: 0.34870 ** [JOINT LOSS] ** : 1.024201 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003002 | Grad Max: 0.076742 -> Layer: shared_layers.0.bias | Grad Mean: 0.098808 | Grad Max: 0.459054 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009253 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010470 | Grad Max: 0.010470 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000757 | Grad Max: 0.050561 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013559 | Grad Max: 0.273367 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.004251 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005837 | Grad Max: 0.020700 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000376 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001340 | Grad Max: 0.003294 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | 
Grad Max: 0.000173 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000456 | Grad Max: 0.001346 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000809 | Grad Max: 0.002823 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013761 | Grad Max: 0.013761 [GRADIENT NORM TOTAL] 1.9296 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.326 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50444806 0.49555194] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 204/1844 | B: 253/1795 | C: 258/1790 [LOSS Ex1] A: 0.67505 | B: 0.67596 | C: 0.67421 [LOGITS Ex2 A] Mean Abs: 1.320 | Max: 5.917 [LOSS Ex2] A: 0.28902 | B: 0.41350 | C: 0.36174 ** [JOINT LOSS] ** : 1.029824 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.068181 -> Layer: shared_layers.0.bias | Grad Mean: 0.161142 | Grad Max: 0.778228 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.009773 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011687 | Grad Max: 0.011687 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001090 | Grad Max: 0.088751 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020233 | Grad Max: 0.495700 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007143 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009427 | Grad Max: 0.036998 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000529 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002078 | Grad Max: 0.004738 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000693 | Grad Max: 0.002006 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001421 | Grad Max: 0.003137 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022162 | Grad Max: 0.022162 [GRADIENT NORM TOTAL] 3.0677 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.279 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50725543 0.49274454] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 189/1859 | B: 234/1622 | C: 245/1803 [LOSS Ex1] A: 0.67811 | B: 0.67824 | C: 0.67287 [LOGITS Ex2 A] Mean Abs: 1.313 | Max: 5.323 [LOSS Ex2] A: 0.26815 | B: 0.40779 | C: 0.35330 ** [JOINT LOSS] ** : 1.019487 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004892 | Grad Max: 0.147861 -> Layer: shared_layers.0.bias | Grad Mean: 0.208100 | Grad Max: 1.011533 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.007929 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002693 | Grad Max: 0.002693 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001577 | Grad Max: 0.119885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029085 | Grad Max: 0.664113 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.009383 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013409 | Grad Max: 0.048520 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000778 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003020 | Grad Max: 0.008217 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000994 | Grad Max: 0.002641 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002164 | Grad Max: 0.004551 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031013 | Grad Max: 0.031013 [GRADIENT NORM TOTAL] 3.9611 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.046 | Max: 0.193 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5153727 0.48462728] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.018 [MASKS] A(Pass/Fail): 173/1875 | B: 238/1810 | C: 226/1822 [LOSS Ex1] A: 0.67764 | B: 0.67781 | C: 0.67718 [LOGITS Ex2 A] Mean Abs: 1.301 | Max: 5.165 [LOSS Ex2] A: 0.28373 | B: 0.43932 | C: 
0.38299 ** [JOINT LOSS] ** : 1.046224 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.042228 -> Layer: shared_layers.0.bias | Grad Mean: 0.094263 | Grad Max: 0.498553 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001856 | Grad Max: 0.008547 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010012 | Grad Max: 0.010012 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000691 | Grad Max: 0.054142 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012557 | Grad Max: 0.287896 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000137 | Grad Max: 0.004145 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005252 | Grad Max: 0.021414 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000391 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001117 | Grad Max: 0.003264 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000152 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000361 | Grad Max: 0.001068 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000608 | Grad Max: 0.001915 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010710 | Grad Max: 0.010710 [GRADIENT NORM TOTAL] 1.8484 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.303 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5605643 0.43943572] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.019 [MASKS] A(Pass/Fail): 210/1838 | B: 265/1783 | C: 175/1201 [LOSS Ex1] A: 0.67608 | B: 0.67821 | C: 0.67278 [LOGITS Ex2 A] Mean Abs: 1.345 | Max: 5.248 [LOSS Ex2] A: 0.26813 | B: 0.42030 | C: 0.36271 ** [JOINT LOSS] ** : 1.026072 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.034189 -> Layer: shared_layers.0.bias | Grad Mean: 0.095689 | Grad Max: 0.505889 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.008881 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008561 | Grad Max: 0.008561 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.000709 | Grad Max: 0.076884 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012992 | Grad Max: 0.431158 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.005542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005679 | Grad Max: 0.025151 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000399 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001268 | Grad Max: 0.004067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000178 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000430 | Grad Max: 0.001219 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000754 | Grad Max: 0.002647 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012820 | Grad Max: 0.012820 [GRADIENT NORM TOTAL] 1.9602 [EPOCH SUMMARY] Train Loss: 1.0309 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 1.0028 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0083 -> New: 1.0028) ############################## EPOCH 40/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.338 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029471 0.4970529] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 197/1851 | B: 262/1786 | C: 268/1780 [LOSS Ex1] A: 0.67849 | B: 0.67581 | C: 0.67186 [LOGITS Ex2 A] Mean Abs: 1.335 | Max: 4.911 [LOSS Ex2] A: 0.26524 | B: 0.40441 | C: 0.36721 ** [JOINT LOSS] ** : 1.021010 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004468 | Grad Max: 0.112827 -> Layer: shared_layers.0.bias | Grad Mean: 0.116551 | Grad Max: 0.534937 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.007887 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003120 | Grad Max: 0.003120 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000883 | Grad Max: 0.073040 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.015528 | Grad Max: 0.390389 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.004435 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006876 | Grad Max: 0.025507 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000392 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001590 | Grad Max: 0.004286 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000204 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000524 | Grad Max: 0.001581 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001285 | Grad Max: 0.003257 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016926 | Grad Max: 0.016926 [GRADIENT NORM TOTAL] 2.1795 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.158 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.557271 0.44272906] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 215/1833 | B: 237/1619 | C: 251/1797 [LOSS Ex1] A: 0.67704 | B: 0.67810 | C: 0.67541 [LOGITS Ex2 A] Mean Abs: 1.348 | Max: 5.340 [LOSS Ex2] A: 0.27641 | B: 0.39672 | C: 0.36477 ** [JOINT LOSS] ** : 1.022821 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001843 | Grad Max: 0.032823 -> Layer: shared_layers.0.bias | Grad Mean: 0.037665 | Grad Max: 0.220912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.009018 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011797 | Grad Max: 0.011797 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000318 | Grad Max: 0.042752 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005202 | Grad Max: 0.223902 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002454 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001616 | Grad Max: 0.010153 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000260 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.001816 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000004 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000104 | Grad Max: 0.000509 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000495 | Grad Max: 0.001693 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003000 | Grad Max: 0.003000 [GRADIENT NORM TOTAL] 0.7658 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.211 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5343359 0.46566406] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.019 [MASKS] A(Pass/Fail): 175/1441 | B: 257/1791 | C: 262/1786 [LOSS Ex1] A: 0.67613 | B: 0.67766 | C: 0.67325 [LOGITS Ex2 A] Mean Abs: 1.409 | Max: 6.849 [LOSS Ex2] A: 0.26367 | B: 0.43860 | C: 0.39308 ** [JOINT LOSS] ** : 1.040795 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003724 | Grad Max: 0.107392 -> Layer: shared_layers.0.bias | Grad Mean: 0.130072 | Grad Max: 0.653533 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.008547 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000017 | Grad Max: 0.000017 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001021 | Grad Max: 0.105917 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019000 | Grad Max: 0.604004 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000211 | Grad Max: 0.005802 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007981 | Grad Max: 0.031555 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000432 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001775 | Grad Max: 0.004620 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000220 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000573 | Grad Max: 0.001749 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000988 | Grad Max: 0.002204 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016040 | Grad Max: 0.016040 [GRADIENT NORM TOTAL] 2.5779 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.340 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078193 0.4921807] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 235/1813 | B: 283/1765 | C: 267/1781 [LOSS Ex1] A: 0.67637 | B: 0.67806 | C: 0.67133 [LOGITS Ex2 A] Mean Abs: 1.380 | Max: 6.020 [LOSS Ex2] A: 0.26922 | B: 0.41743 | C: 0.37111 ** [JOINT LOSS] ** : 1.027839 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002370 | Grad Max: 0.057503 -> Layer: shared_layers.0.bias | Grad Mean: 0.058253 | Grad Max: 0.247137 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.008995 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008818 | Grad Max: 0.008818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000455 | Grad Max: 0.049657 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008111 | Grad Max: 0.270929 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003200 | Grad Max: 0.016477 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000732 | Grad Max: 0.002326 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000239 | Grad Max: 0.000986 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001807 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006737 | Grad Max: 0.006737 [GRADIENT NORM TOTAL] 1.1591 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.321 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50257045 0.4974296 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.019 [MASKS] A(Pass/Fail): 226/1822 | B: 281/1767 | C: 225/1823 [LOSS Ex1] A: 0.67562 | B: 0.67564 | C: 0.67537 [LOGITS Ex2 A] Mean Abs: 1.352 | Max: 5.255 [LOSS Ex2] A: 0.26527 | B: 0.41613 | C: 
0.36251 ** [JOINT LOSS] ** : 1.023507 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.085456 -> Layer: shared_layers.0.bias | Grad Mean: 0.226329 | Grad Max: 1.118514 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.009572 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015711 | Grad Max: 0.015711 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001675 | Grad Max: 0.196294 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031568 | Grad Max: 1.109839 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.010000 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014036 | Grad Max: 0.055863 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000762 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003111 | Grad Max: 0.007622 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000320 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001013 | Grad Max: 0.002510 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002009 | Grad Max: 0.004628 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030483 | Grad Max: 0.030483 [GRADIENT NORM TOTAL] 4.5355 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.330 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50425625 0.49574378] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.019 [MASKS] A(Pass/Fail): 223/1825 | B: 249/1607 | C: 260/1788 [LOSS Ex1] A: 0.67470 | B: 0.67793 | C: 0.67309 [LOGITS Ex2 A] Mean Abs: 1.334 | Max: 5.474 [LOSS Ex2] A: 0.29247 | B: 0.40468 | C: 0.36444 ** [JOINT LOSS] ** : 1.029108 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005170 | Grad Max: 0.112255 -> Layer: shared_layers.0.bias | Grad Mean: 0.293875 | Grad Max: 1.521630 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.008952 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006720 | Grad Max: 0.006720 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.196565 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039829 | Grad Max: 1.100478 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000465 | Grad Max: 0.011420 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017901 | Grad Max: 0.061010 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000927 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003952 | Grad Max: 0.008857 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000404 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001285 | Grad Max: 0.003252 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002510 | Grad Max: 0.004887 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039128 | Grad Max: 0.039128 [GRADIENT NORM TOTAL] 5.7510 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.282 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070765 0.49292347] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 211/1837 | B: 260/1788 | C: 227/1821 [LOSS Ex1] A: 0.67782 | B: 0.67749 | C: 0.67481 [LOGITS Ex2 A] Mean Abs: 1.345 | Max: 5.187 [LOSS Ex2] A: 0.27434 | B: 0.42952 | C: 0.37377 ** [JOINT LOSS] ** : 1.035917 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.064049 -> Layer: shared_layers.0.bias | Grad Mean: 0.054346 | Grad Max: 0.274009 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001821 | Grad Max: 0.007727 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004482 | Grad Max: 0.004482 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000490 | Grad Max: 0.102765 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008460 | Grad Max: 0.580881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000085 | Grad Max: 0.003336 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003069 | Grad Max: 0.018995 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000242 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.000694 | Grad Max: 0.002398 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000226 | Grad Max: 0.000840 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000548 | Grad Max: 0.001809 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007361 | Grad Max: 0.007361 [GRADIENT NORM TOTAL] 1.2967 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5158098 0.48419026] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 203/1845 | B: 286/1762 | C: 271/1777 [LOSS Ex1] A: 0.67735 | B: 0.67789 | C: 0.67138 [LOGITS Ex2 A] Mean Abs: 1.364 | Max: 5.437 [LOSS Ex2] A: 0.29598 | B: 0.44064 | C: 0.35815 ** [JOINT LOSS] ** : 1.040463 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006642 | Grad Max: 0.169095 -> Layer: shared_layers.0.bias | Grad Mean: 0.365065 | Grad Max: 1.860129 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001909 | Grad Max: 0.008092 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000394 | Grad Max: 0.000394 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002616 | Grad Max: 0.214155 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049485 | Grad Max: 1.137242 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000600 | Grad Max: 0.013953 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023134 | Grad Max: 0.078732 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001081 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005098 | Grad Max: 0.011175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000480 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001663 | Grad Max: 0.004183 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003271 | Grad Max: 0.005992 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051198 | Grad Max: 0.051198 [GRADIENT NORM TOTAL] 
7.1506 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.308 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5622334 0.4377666] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 236/1812 | B: 283/1765 | C: 242/1806 [LOSS Ex1] A: 0.67573 | B: 0.67546 | C: 0.67400 [LOGITS Ex2 A] Mean Abs: 1.408 | Max: 5.076 [LOSS Ex2] A: 0.27194 | B: 0.44354 | C: 0.37259 ** [JOINT LOSS] ** : 1.037750 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008904 | Grad Max: 0.225678 -> Layer: shared_layers.0.bias | Grad Mean: 0.503962 | Grad Max: 2.529361 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.009759 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016153 | Grad Max: 0.016153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003649 | Grad Max: 0.282990 -> Layer: exit2_layers.0.bias | Grad Mean: 0.068824 | Grad Max: 1.584049 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000833 | Grad Max: 0.018867 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032034 | Grad Max: 0.107083 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001597 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007063 | Grad Max: 0.016663 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.000650 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002300 | Grad Max: 0.005533 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004534 | Grad Max: 0.007986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.070967 | Grad Max: 0.070967 [GRADIENT NORM TOTAL] 9.8683 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.343 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027356 0.4972644] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 216/1832 | B: 251/1605 | C: 250/1798 [LOSS Ex1] A: 0.67820 | 
B: 0.67778 | C: 0.67398 [LOGITS Ex2 A] Mean Abs: 1.414 | Max: 5.135 [LOSS Ex2] A: 0.26965 | B: 0.40345 | C: 0.35129 ** [JOINT LOSS] ** : 1.018116 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005821 | Grad Max: 0.143468 -> Layer: shared_layers.0.bias | Grad Mean: 0.315927 | Grad Max: 1.666891 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001866 | Grad Max: 0.008445 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009779 | Grad Max: 0.009779 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.161288 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044320 | Grad Max: 0.877876 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.013886 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021215 | Grad Max: 0.079332 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001005 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004655 | Grad Max: 0.009999 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000429 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001527 | Grad Max: 0.003793 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002945 | Grad Max: 0.005965 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047742 | Grad Max: 0.047742 [GRADIENT NORM TOTAL] 6.2451 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.162 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55865943 0.4413405 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 237/1811 | B: 262/1786 | C: 231/1817 [LOSS Ex1] A: 0.67672 | B: 0.67734 | C: 0.67531 [LOGITS Ex2 A] Mean Abs: 1.372 | Max: 5.418 [LOSS Ex2] A: 0.26450 | B: 0.43684 | C: 0.37288 ** [JOINT LOSS] ** : 1.034530 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002619 | Grad Max: 0.076471 -> Layer: shared_layers.0.bias | Grad Mean: 0.140799 | Grad Max: 0.647826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 
0.008436 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.008072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001003 | Grad Max: 0.107010 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018560 | Grad Max: 0.595377 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.006498 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008418 | Grad Max: 0.036111 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000444 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001883 | Grad Max: 0.004792 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000615 | Grad Max: 0.001733 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001294 | Grad Max: 0.002683 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019530 | Grad Max: 0.019530 [GRADIENT NORM TOTAL] 2.7186 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.216 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53533083 0.4646692 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 188/1428 | B: 291/1757 | C: 271/1777 [LOSS Ex1] A: 0.67581 | B: 0.67775 | C: 0.67264 [LOGITS Ex2 A] Mean Abs: 1.389 | Max: 6.229 [LOSS Ex2] A: 0.26224 | B: 0.42287 | C: 0.36336 ** [JOINT LOSS] ** : 1.024891 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004009 | Grad Max: 0.090402 -> Layer: shared_layers.0.bias | Grad Mean: 0.224128 | Grad Max: 1.146800 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.008771 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005162 | Grad Max: 0.005162 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.142671 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030206 | Grad Max: 0.804746 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.008072 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013925 | Grad Max: 0.045890 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000685 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003076 | Grad Max: 0.006926 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000287 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001001 | Grad Max: 0.002542 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002038 | Grad Max: 0.003954 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031502 | Grad Max: 0.031502 [GRADIENT NORM TOTAL] 4.4075 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50765514 0.49234492] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 250/1798 | B: 285/1763 | C: 261/1787 [LOSS Ex1] A: 0.67604 | B: 0.67532 | C: 0.67258 [LOGITS Ex2 A] Mean Abs: 1.402 | Max: 6.683 [LOSS Ex2] A: 0.26495 | B: 0.41150 | C: 0.33119 ** [JOINT LOSS] ** : 1.010530 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.045802 -> Layer: shared_layers.0.bias | Grad Mean: 0.121416 | Grad Max: 0.641891 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.008703 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006368 | Grad Max: 0.006368 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000892 | Grad Max: 0.098443 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016458 | Grad Max: 0.547800 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.006060 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007682 | Grad Max: 0.027814 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001701 | Grad Max: 0.004533 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000537 | Grad Max: 0.001490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000991 | Grad Max: 0.002781 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.015209 | Grad Max: 0.015209 [GRADIENT NORM TOTAL] 2.4760 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.325 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50278413 0.4972159 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 242/1806 | B: 254/1602 | C: 175/1201 [LOSS Ex1] A: 0.67529 | B: 0.67765 | C: 0.67191 [LOGITS Ex2 A] Mean Abs: 1.400 | Max: 6.019 [LOSS Ex2] A: 0.27239 | B: 0.41579 | C: 0.37350 ** [JOINT LOSS] ** : 1.028840 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004561 | Grad Max: 0.107642 -> Layer: shared_layers.0.bias | Grad Mean: 0.228635 | Grad Max: 1.123531 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.009119 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007090 | Grad Max: 0.007090 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001690 | Grad Max: 0.133100 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031421 | Grad Max: 0.734520 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.008223 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014180 | Grad Max: 0.048163 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000717 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003134 | Grad Max: 0.007344 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001017 | Grad Max: 0.002577 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001894 | Grad Max: 0.003989 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031262 | Grad Max: 0.031262 [GRADIENT NORM TOTAL] 4.5339 [EPOCH SUMMARY] Train Loss: 1.0283 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 1.0087 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 41/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.333 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040726 0.49592736] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 239/1809 | B: 265/1783 | C: 240/1808 [LOSS Ex1] A: 0.67438 | B: 0.67720 | C: 0.67354 [LOGITS Ex2 A] Mean Abs: 1.415 | Max: 5.954 [LOSS Ex2] A: 0.28264 | B: 0.44915 | C: 0.35189 ** [JOINT LOSS] ** : 1.036264 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007176 | Grad Max: 0.194677 -> Layer: shared_layers.0.bias | Grad Mean: 0.351506 | Grad Max: 1.723177 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.009838 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014813 | Grad Max: 0.014813 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002554 | Grad Max: 0.205361 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048120 | Grad Max: 1.119263 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.013766 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022048 | Grad Max: 0.075310 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001012 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004868 | Grad Max: 0.010996 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000444 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001578 | Grad Max: 0.003834 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003023 | Grad Max: 0.005494 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048925 | Grad Max: 0.048925 [GRADIENT NORM TOTAL] 6.8108 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.285 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069191 0.4930809] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 222/1826 | B: 292/1756 | C: 262/1786 [LOSS Ex1] A: 0.67757 | B: 0.67762 | C: 0.67236 [LOGITS Ex2 A] Mean Abs: 1.377 | Max: 5.228 [LOSS Ex2] A: 0.26599 | B: 0.42682 | C: 0.34859 ** [JOINT LOSS] ** : 1.022979 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003203 | Grad Max: 0.075445 -> Layer: shared_layers.0.bias | Grad Mean: 0.199351 | Grad Max: 0.956747 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.007668 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002904 | Grad Max: 0.002904 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001398 | Grad Max: 0.121428 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025945 | Grad Max: 0.678230 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.007347 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011794 | Grad Max: 0.040918 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000615 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002597 | Grad Max: 0.006086 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000849 | Grad Max: 0.002214 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001490 | Grad Max: 0.003936 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025545 | Grad Max: 0.025545 [GRADIENT NORM TOTAL] 3.9280 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.047 | Max: 0.197 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51615 0.48384997] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.518 | Std: 0.019 [MASKS] A(Pass/Fail): 225/1823 | B: 286/1762 | C: 241/1807 [LOSS Ex1] A: 0.67710 | B: 0.67518 | C: 0.67333 [LOGITS Ex2 A] Mean Abs: 1.303 | Max: 5.322 [LOSS Ex2] A: 0.28266 | B: 0.41595 | C: 0.36641 ** [JOINT LOSS] ** : 1.030211 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005290 | Grad Max: 0.119424 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.283566 | Grad Max: 1.441921 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001942 | Grad Max: 0.008211 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000814 | Grad Max: 0.000814 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.224933 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038596 | Grad Max: 1.272223 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000464 | Grad Max: 0.010622 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017993 | Grad Max: 0.062494 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000851 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003969 | Grad Max: 0.008783 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000370 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001288 | Grad Max: 0.003139 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002448 | Grad Max: 0.004718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038603 | Grad Max: 0.038603 [GRADIENT NORM TOTAL] 5.6415 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.311 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56352425 0.4364758 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.020 [MASKS] A(Pass/Fail): 248/1800 | B: 255/1601 | C: 251/1797 [LOSS Ex1] A: 0.67544 | B: 0.67753 | C: 0.67233 [LOGITS Ex2 A] Mean Abs: 1.363 | Max: 5.709 [LOSS Ex2] A: 0.26039 | B: 0.42267 | C: 0.36363 ** [JOINT LOSS] ** : 1.023997 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006233 | Grad Max: 0.135248 -> Layer: shared_layers.0.bias | Grad Mean: 0.376423 | Grad Max: 1.854193 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.009471 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014678 | Grad Max: 0.014678 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002685 | Grad Max: 0.217696 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050581 | Grad Max: 1.222566 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.014255 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023693 | Grad Max: 0.079564 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001133 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005199 | Grad Max: 0.011867 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000455 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001681 | Grad Max: 0.004032 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003206 | Grad Max: 0.006090 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051063 | Grad Max: 0.051063 [GRADIENT NORM TOTAL] 7.3864 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.347 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026144 0.4973856] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 228/1820 | B: 269/1779 | C: 279/1769 [LOSS Ex1] A: 0.67796 | B: 0.67709 | C: 0.67045 [LOGITS Ex2 A] Mean Abs: 1.375 | Max: 5.039 [LOSS Ex2] A: 0.24896 | B: 0.43449 | C: 0.35177 ** [JOINT LOSS] ** : 1.020241 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005210 | Grad Max: 0.131115 -> Layer: shared_layers.0.bias | Grad Mean: 0.227558 | Grad Max: 1.127227 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.007965 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003479 | Grad Max: 0.003479 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001681 | Grad Max: 0.138360 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031109 | Grad Max: 0.777677 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.009418 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014220 | Grad Max: 0.047524 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000701 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003171 | Grad Max: 0.007605 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000331 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001024 | Grad Max: 0.002757 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002115 | Grad Max: 0.003927 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031647 | Grad Max: 0.031647 [GRADIENT NORM TOTAL] 4.4075 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.164 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5596446 0.4403554] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 245/1803 | B: 295/1753 | C: 242/1806 [LOSS Ex1] A: 0.67648 | B: 0.67751 | C: 0.67471 [LOGITS Ex2 A] Mean Abs: 1.409 | Max: 5.157 [LOSS Ex2] A: 0.27450 | B: 0.43269 | C: 0.36316 ** [JOINT LOSS] ** : 1.033017 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003551 | Grad Max: 0.078374 -> Layer: shared_layers.0.bias | Grad Mean: 0.213775 | Grad Max: 1.092499 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.008703 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010556 | Grad Max: 0.010556 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001519 | Grad Max: 0.126509 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028666 | Grad Max: 0.739060 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.009424 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013043 | Grad Max: 0.051688 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000672 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002854 | Grad Max: 0.007163 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000281 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000918 | Grad Max: 0.002447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001651 | Grad Max: 0.003530 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027240 | Grad Max: 0.027240 [GRADIENT NORM TOTAL] 4.2295 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.049 | Max: 0.219 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5359657 0.46403435] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.020 [MASKS] A(Pass/Fail): 202/1414 | B: 288/1760 | C: 246/1802 [LOSS Ex1] A: 0.67557 | B: 0.67507 | C: 0.67510 [LOGITS Ex2 A] Mean Abs: 1.449 | Max: 7.200 [LOSS Ex2] A: 0.25691 | B: 0.42673 | C: 0.36736 ** [JOINT LOSS] ** : 1.025577 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005055 | Grad Max: 0.118786 -> Layer: shared_layers.0.bias | Grad Mean: 0.311351 | Grad Max: 1.564967 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.009144 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007635 | Grad Max: 0.007635 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.175879 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041289 | Grad Max: 1.006615 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000496 | Grad Max: 0.012126 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019271 | Grad Max: 0.069555 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000879 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004224 | Grad Max: 0.009123 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000377 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001371 | Grad Max: 0.003353 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002578 | Grad Max: 0.004964 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042173 | Grad Max: 0.042173 [GRADIENT NORM TOTAL] 6.0208 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.348 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075727 0.4924273] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 258/1790 | B: 260/1596 | C: 244/1804 [LOSS Ex1] A: 0.67581 | B: 0.67743 | C: 0.67432 [LOGITS Ex2 A] Mean Abs: 1.415 | Max: 5.726 [LOSS Ex2] A: 0.26779 | B: 0.39479 | C: 0.37218 ** [JOINT LOSS] ** : 1.020772 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003095 | Grad Max: 0.066644 -> Layer: shared_layers.0.bias | Grad Mean: 0.155125 | Grad Max: 0.821169 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.008793 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009917 | Grad Max: 0.009917 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001184 | Grad Max: 0.102034 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022053 | Grad Max: 0.581166 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.006133 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009653 | Grad Max: 0.036841 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000492 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002130 | Grad Max: 0.005171 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000253 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.001903 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001311 | Grad Max: 0.003206 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021530 | Grad Max: 0.021530 [GRADIENT NORM TOTAL] 3.1550 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.328 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028795 0.4971205] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 253/1795 | B: 272/1776 | C: 262/1786 [LOSS Ex1] A: 0.67504 | B: 0.67698 | C: 0.67404 [LOGITS Ex2 A] Mean Abs: 1.373 | Max: 5.088 [LOSS Ex2] A: 0.26964 | B: 0.43809 | C: 0.37342 ** [JOINT LOSS] ** : 1.035739 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.101677 -> Layer: shared_layers.0.bias | Grad Mean: 0.258778 | Grad Max: 1.241038 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.009419 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012977 | Grad Max: 0.012977 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 
0.147480 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033868 | Grad Max: 0.842740 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000405 | Grad Max: 0.010139 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015750 | Grad Max: 0.054388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000714 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003493 | Grad Max: 0.007852 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000352 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.002890 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002361 | Grad Max: 0.004381 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036629 | Grad Max: 0.036629 [GRADIENT NORM TOTAL] 4.9919 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.336 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503973 0.49602696] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 248/1800 | B: 296/1752 | C: 238/1810 [LOSS Ex1] A: 0.67414 | B: 0.67741 | C: 0.67443 [LOGITS Ex2 A] Mean Abs: 1.353 | Max: 5.710 [LOSS Ex2] A: 0.27601 | B: 0.44080 | C: 0.38342 ** [JOINT LOSS] ** : 1.042070 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005498 | Grad Max: 0.141402 -> Layer: shared_layers.0.bias | Grad Mean: 0.342538 | Grad Max: 1.679522 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.008809 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006434 | Grad Max: 0.006434 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002384 | Grad Max: 0.246387 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045286 | Grad Max: 1.371594 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000543 | Grad Max: 0.013041 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021111 | Grad Max: 0.071785 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000928 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004615 | Grad Max: 0.009924 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000447 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001510 | Grad Max: 0.003825 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003039 | Grad Max: 0.005839 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047868 | Grad Max: 0.047868 [GRADIENT NORM TOTAL] 6.6943 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.287 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50682634 0.49317366] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 230/1818 | B: 290/1758 | C: 274/1774 [LOSS Ex1] A: 0.67738 | B: 0.67496 | C: 0.67080 [LOGITS Ex2 A] Mean Abs: 1.331 | Max: 5.437 [LOSS Ex2] A: 0.26364 | B: 0.40523 | C: 0.38011 ** [JOINT LOSS] ** : 1.024040 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003295 | Grad Max: 0.084674 -> Layer: shared_layers.0.bias | Grad Mean: 0.184452 | Grad Max: 0.804883 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.007202 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003365 | Grad Max: 0.003365 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001295 | Grad Max: 0.120673 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024136 | Grad Max: 0.691168 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.007284 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011300 | Grad Max: 0.040163 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000582 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002494 | Grad Max: 0.005793 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002170 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.003693 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024642 | Grad Max: 0.024642 [GRADIENT NORM TOTAL] 3.4953 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.198 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.516367 0.48363304] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 231/1817 | B: 261/1595 | C: 235/1813 [LOSS Ex1] A: 0.67692 | B: 0.67733 | C: 0.67437 [LOGITS Ex2 A] Mean Abs: 1.359 | Max: 5.436 [LOSS Ex2] A: 0.27637 | B: 0.40349 | C: 0.37387 ** [JOINT LOSS] ** : 1.027451 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004180 | Grad Max: 0.091607 -> Layer: shared_layers.0.bias | Grad Mean: 0.226722 | Grad Max: 1.119768 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.007493 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002521 | Grad Max: 0.002521 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001619 | Grad Max: 0.135182 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030432 | Grad Max: 0.759091 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.011050 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013903 | Grad Max: 0.057103 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000688 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003058 | Grad Max: 0.006733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000306 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000997 | Grad Max: 0.002587 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001917 | Grad Max: 0.004109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030880 | Grad Max: 0.030880 [GRADIENT NORM TOTAL] 4.4044 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.314 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5644704 0.4355296] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.020 [MASKS] A(Pass/Fail): 256/1792 | B: 273/1775 | C: 282/1766 [LOSS Ex1] A: 0.67523 | B: 0.67688 | C: 0.67085 [LOGITS Ex2 A] Mean Abs: 
1.419 | Max: 5.219 [LOSS Ex2] A: 0.26946 | B: 0.43785 | C: 0.35078 ** [JOINT LOSS] ** : 1.027016 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008139 | Grad Max: 0.209688 -> Layer: shared_layers.0.bias | Grad Mean: 0.403954 | Grad Max: 1.976498 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.009293 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013015 | Grad Max: 0.013015 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002923 | Grad Max: 0.222338 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054857 | Grad Max: 1.186575 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000657 | Grad Max: 0.014319 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025431 | Grad Max: 0.086142 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001126 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005623 | Grad Max: 0.012461 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000541 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001837 | Grad Max: 0.004484 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003572 | Grad Max: 0.007025 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057563 | Grad Max: 0.057563 [GRADIENT NORM TOTAL] 7.7500 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.350 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50253385 0.4974661 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 242/1806 | B: 298/1750 | C: 184/1192 [LOSS Ex1] A: 0.67779 | B: 0.67731 | C: 0.66958 [LOGITS Ex2 A] Mean Abs: 1.388 | Max: 5.126 [LOSS Ex2] A: 0.26434 | B: 0.43361 | C: 0.35407 ** [JOINT LOSS] ** : 1.025565 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002725 | Grad Max: 0.082938 -> Layer: shared_layers.0.bias | Grad Mean: 0.222302 | Grad Max: 1.085907 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.007930 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.003392 | Grad Max: 0.003392 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001515 | Grad Max: 0.133731 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028767 | Grad Max: 0.752768 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.008620 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013794 | Grad Max: 0.050141 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000736 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002992 | Grad Max: 0.007082 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000287 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002529 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001672 | Grad Max: 0.003733 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029033 | Grad Max: 0.029033 [GRADIENT NORM TOTAL] 4.3617 [EPOCH SUMMARY] Train Loss: 1.0282 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9974 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 1.0028 -> New: 0.9974) ############################## EPOCH 42/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.560389 0.43961108] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 252/1796 | B: 291/1757 | C: 247/1801 [LOSS Ex1] A: 0.67629 | B: 0.67486 | C: 0.67452 [LOGITS Ex2 A] Mean Abs: 1.359 | Max: 5.559 [LOSS Ex2] A: 0.26835 | B: 0.40203 | C: 0.34636 ** [JOINT LOSS] ** : 1.014135 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003245 | Grad Max: 0.071985 -> Layer: shared_layers.0.bias | Grad Mean: 0.192243 | Grad Max: 0.940855 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.009127 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013784 | Grad Max: 0.013784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001357 | Grad Max: 
0.168812 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025489 | Grad Max: 0.956262 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.007638 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011276 | Grad Max: 0.039603 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000571 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002489 | Grad Max: 0.005665 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000803 | Grad Max: 0.002068 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001619 | Grad Max: 0.003386 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024798 | Grad Max: 0.024798 [GRADIENT NORM TOTAL] 3.8409 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.222 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5364668 0.46353316] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.020 [MASKS] A(Pass/Fail): 208/1408 | B: 263/1593 | C: 249/1799 [LOSS Ex1] A: 0.67537 | B: 0.67724 | C: 0.67313 [LOGITS Ex2 A] Mean Abs: 1.383 | Max: 5.274 [LOSS Ex2] A: 0.24829 | B: 0.41124 | C: 0.35268 ** [JOINT LOSS] ** : 1.012651 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004859 | Grad Max: 0.107941 -> Layer: shared_layers.0.bias | Grad Mean: 0.294331 | Grad Max: 1.479696 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001954 | Grad Max: 0.009052 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007801 | Grad Max: 0.007801 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.207784 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039518 | Grad Max: 1.156150 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.012063 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018260 | Grad Max: 0.065612 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000897 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004021 | Grad Max: 0.009438 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000401 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001299 | Grad Max: 0.003338 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002494 | Grad Max: 0.005281 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039568 | Grad Max: 0.039568 [GRADIENT NORM TOTAL] 5.8424 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.351 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50751626 0.4924838 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 265/1783 | B: 276/1772 | C: 247/1801 [LOSS Ex1] A: 0.67562 | B: 0.67679 | C: 0.67298 [LOGITS Ex2 A] Mean Abs: 1.394 | Max: 6.754 [LOSS Ex2] A: 0.26664 | B: 0.43684 | C: 0.34925 ** [JOINT LOSS] ** : 1.026037 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003677 | Grad Max: 0.061192 -> Layer: shared_layers.0.bias | Grad Mean: 0.143387 | Grad Max: 0.702649 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.008997 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014131 | Grad Max: 0.014131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001100 | Grad Max: 0.111442 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020079 | Grad Max: 0.612067 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.008326 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008948 | Grad Max: 0.039915 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000504 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001999 | Grad Max: 0.004856 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000643 | Grad Max: 0.001726 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001290 | Grad Max: 0.002896 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019358 | Grad Max: 0.019358 [GRADIENT NORM TOTAL] 2.8340 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.330 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029721 0.49702787] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 266/1782 | B: 301/1747 | C: 262/1786 [LOSS Ex1] A: 0.67484 | B: 0.67722 | C: 0.67368 [LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.062 [LOSS Ex2] A: 0.26713 | B: 0.43228 | C: 0.38264 ** [JOINT LOSS] ** : 1.035930 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006201 | Grad Max: 0.156582 -> Layer: shared_layers.0.bias | Grad Mean: 0.276992 | Grad Max: 1.354113 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.009683 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016964 | Grad Max: 0.016964 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002005 | Grad Max: 0.171788 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037678 | Grad Max: 0.918789 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.010751 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017384 | Grad Max: 0.061440 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000851 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003842 | Grad Max: 0.009125 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000393 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001255 | Grad Max: 0.003160 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002456 | Grad Max: 0.004414 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038677 | Grad Max: 0.038677 [GRADIENT NORM TOTAL] 5.3559 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.339 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038848 0.4961152] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 257/1791 | B: 293/1755 | C: 289/1759 [LOSS Ex1] A: 0.67394 | B: 0.67476 | C: 0.67002 [LOGITS Ex2 A] Mean Abs: 
1.404 | Max: 5.286 [LOSS Ex2] A: 0.28781 | B: 0.41574 | C: 0.35214 ** [JOINT LOSS] ** : 1.024804 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006875 | Grad Max: 0.192136 -> Layer: shared_layers.0.bias | Grad Mean: 0.356231 | Grad Max: 1.660360 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.008930 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004776 | Grad Max: 0.004776 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002577 | Grad Max: 0.210289 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048120 | Grad Max: 1.153459 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.012643 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022213 | Grad Max: 0.072620 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001249 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004929 | Grad Max: 0.011534 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000444 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001607 | Grad Max: 0.003893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003028 | Grad Max: 0.005811 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049944 | Grad Max: 0.049944 [GRADIENT NORM TOTAL] 6.9627 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.289 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067477 0.49325228] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 236/1812 | B: 268/1588 | C: 219/1829 [LOSS Ex1] A: 0.67722 | B: 0.67715 | C: 0.67564 [LOGITS Ex2 A] Mean Abs: 1.375 | Max: 5.665 [LOSS Ex2] A: 0.25932 | B: 0.39650 | C: 0.38786 ** [JOINT LOSS] ** : 1.024563 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002890 | Grad Max: 0.063729 -> Layer: shared_layers.0.bias | Grad Mean: 0.157313 | Grad Max: 0.758484 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001735 | Grad Max: 0.007558 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.002315 | Grad Max: 0.002315 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.110326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021414 | Grad Max: 0.642948 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.006364 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010009 | Grad Max: 0.036666 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000522 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002182 | Grad Max: 0.005347 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000712 | Grad Max: 0.001942 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001233 | Grad Max: 0.003366 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021484 | Grad Max: 0.021484 [GRADIENT NORM TOTAL] 3.1693 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51657516 0.4834248 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 239/1809 | B: 277/1771 | C: 274/1774 [LOSS Ex1] A: 0.67675 | B: 0.67670 | C: 0.66950 [LOGITS Ex2 A] Mean Abs: 1.307 | Max: 5.596 [LOSS Ex2] A: 0.26268 | B: 0.43257 | C: 0.36114 ** [JOINT LOSS] ** : 1.026446 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005865 | Grad Max: 0.147466 -> Layer: shared_layers.0.bias | Grad Mean: 0.293617 | Grad Max: 1.492421 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.007506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009379 | Grad Max: 0.009379 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.140582 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040154 | Grad Max: 0.782750 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000487 | Grad Max: 0.010976 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019006 | Grad Max: 0.064234 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | 
Grad Max: 0.000934 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004210 | Grad Max: 0.010082 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000374 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001362 | Grad Max: 0.003362 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002655 | Grad Max: 0.005012 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041677 | Grad Max: 0.041677 [GRADIENT NORM TOTAL] 5.6636 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.317 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5653543 0.43464568] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 265/1783 | B: 301/1747 | C: 258/1790 [LOSS Ex1] A: 0.67503 | B: 0.67713 | C: 0.67364 [LOGITS Ex2 A] Mean Abs: 1.346 | Max: 5.458 [LOSS Ex2] A: 0.24912 | B: 0.43067 | C: 0.36525 ** [JOINT LOSS] ** : 1.023616 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006408 | Grad Max: 0.135141 -> Layer: shared_layers.0.bias | Grad Mean: 0.371652 | Grad Max: 1.791725 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.009906 -> Layer: exit1_layers.0.bias | Grad Mean: 0.020798 | Grad Max: 0.020798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002581 | Grad Max: 0.229560 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049112 | Grad Max: 1.299398 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000587 | Grad Max: 0.016037 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022885 | Grad Max: 0.086307 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001109 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005048 | Grad Max: 0.011963 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000476 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001630 | Grad Max: 0.003976 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002999 | Grad Max: 0.005468 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048582 | Grad Max: 
0.048582 [GRADIENT NORM TOTAL] 7.0928 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.353 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024494 0.49755055] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 249/1799 | B: 295/1753 | C: 242/1806 [LOSS Ex1] A: 0.67762 | B: 0.67466 | C: 0.67320 [LOGITS Ex2 A] Mean Abs: 1.370 | Max: 5.132 [LOSS Ex2] A: 0.25147 | B: 0.40781 | C: 0.35806 ** [JOINT LOSS] ** : 1.014278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004023 | Grad Max: 0.116079 -> Layer: shared_layers.0.bias | Grad Mean: 0.175196 | Grad Max: 0.844326 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.007840 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003986 | Grad Max: 0.003986 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001291 | Grad Max: 0.074996 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024176 | Grad Max: 0.422759 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.007762 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011530 | Grad Max: 0.040509 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000562 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002545 | Grad Max: 0.005893 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000817 | Grad Max: 0.002072 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001625 | Grad Max: 0.003535 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024985 | Grad Max: 0.024985 [GRADIENT NORM TOTAL] 3.3189 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.168 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56112516 0.43887484] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 259/1789 | B: 269/1587 | 
C: 239/1809 [LOSS Ex1] A: 0.67612 | B: 0.67707 | C: 0.67313 [LOGITS Ex2 A] Mean Abs: 1.406 | Max: 5.503 [LOSS Ex2] A: 0.27210 | B: 0.41012 | C: 0.34904 ** [JOINT LOSS] ** : 1.019190 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005228 | Grad Max: 0.140737 -> Layer: shared_layers.0.bias | Grad Mean: 0.251122 | Grad Max: 1.263717 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.008027 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001729 | Grad Max: 0.001729 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001866 | Grad Max: 0.149937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034593 | Grad Max: 0.815124 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.009953 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015375 | Grad Max: 0.052636 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000781 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003417 | Grad Max: 0.008500 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000356 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001122 | Grad Max: 0.002847 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002111 | Grad Max: 0.004694 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034829 | Grad Max: 0.034829 [GRADIENT NORM TOTAL] 4.9167 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.224 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5369349 0.46306515] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 216/1400 | B: 278/1770 | C: 255/1793 [LOSS Ex1] A: 0.67520 | B: 0.67660 | C: 0.67273 [LOGITS Ex2 A] Mean Abs: 1.457 | Max: 4.766 [LOSS Ex2] A: 0.25749 | B: 0.43674 | C: 0.36848 ** [JOINT LOSS] ** : 1.029082 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006052 | Grad Max: 0.134945 -> Layer: shared_layers.0.bias | Grad Mean: 0.367893 | Grad Max: 1.830024 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.001964 | Grad Max: 0.008417 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000336 | Grad Max: 0.000336 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002559 | Grad Max: 0.192549 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048747 | Grad Max: 1.058008 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.015514 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022417 | Grad Max: 0.087196 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.001115 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004920 | Grad Max: 0.012312 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000441 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001587 | Grad Max: 0.003953 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002823 | Grad Max: 0.005244 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047454 | Grad Max: 0.047454 [GRADIENT NORM TOTAL] 7.0894 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.354 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074354 0.49256462] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 276/1772 | B: 302/1746 | C: 294/1754 [LOSS Ex1] A: 0.67544 | B: 0.67704 | C: 0.66968 [LOGITS Ex2 A] Mean Abs: 1.418 | Max: 6.516 [LOSS Ex2] A: 0.27007 | B: 0.41802 | C: 0.34882 ** [JOINT LOSS] ** : 1.019690 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005046 | Grad Max: 0.125624 -> Layer: shared_layers.0.bias | Grad Mean: 0.249864 | Grad Max: 1.194422 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.008819 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007361 | Grad Max: 0.007361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.154900 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034036 | Grad Max: 0.865000 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.010592 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.015525 | Grad Max: 0.056564 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000768 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003443 | Grad Max: 0.007925 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001125 | Grad Max: 0.002770 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002089 | Grad Max: 0.004718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034591 | Grad Max: 0.034591 [GRADIENT NORM TOTAL] 4.8767 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.333 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030571 0.49694294] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 271/1777 | B: 296/1752 | C: 257/1791 [LOSS Ex1] A: 0.67465 | B: 0.67457 | C: 0.67204 [LOGITS Ex2 A] Mean Abs: 1.363 | Max: 5.103 [LOSS Ex2] A: 0.24657 | B: 0.40765 | C: 0.36951 ** [JOINT LOSS] ** : 1.014997 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002818 | Grad Max: 0.069635 -> Layer: shared_layers.0.bias | Grad Mean: 0.157785 | Grad Max: 0.813307 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.009153 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007398 | Grad Max: 0.007398 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001142 | Grad Max: 0.124177 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021317 | Grad Max: 0.703826 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.006136 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009718 | Grad Max: 0.034365 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000493 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002137 | Grad Max: 0.005599 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000684 | Grad Max: 0.001861 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | 
Grad Max: 0.003185 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019984 | Grad Max: 0.019984 [GRADIENT NORM TOTAL] 3.2439 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.342 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503814 0.49618596] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 264/1784 | B: 270/1586 | C: 161/1215 [LOSS Ex1] A: 0.67375 | B: 0.67698 | C: 0.67302 [LOGITS Ex2 A] Mean Abs: 1.354 | Max: 5.725 [LOSS Ex2] A: 0.27833 | B: 0.39712 | C: 0.38267 ** [JOINT LOSS] ** : 1.027295 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003089 | Grad Max: 0.086863 -> Layer: shared_layers.0.bias | Grad Mean: 0.244588 | Grad Max: 1.171963 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.009309 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008512 | Grad Max: 0.008512 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001605 | Grad Max: 0.145007 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030224 | Grad Max: 0.822676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.009078 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014290 | Grad Max: 0.050467 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000698 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003133 | Grad Max: 0.007120 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000336 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001006 | Grad Max: 0.002589 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001757 | Grad Max: 0.003343 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029649 | Grad Max: 0.029649 [GRADIENT NORM TOTAL] 4.7026 [EPOCH SUMMARY] Train Loss: 1.0223 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9924 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9974 -> New: 0.9924) ############################## EPOCH 43/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066921 0.49330786] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 243/1805 | B: 279/1769 | C: 269/1779 [LOSS Ex1] A: 0.67706 | B: 0.67652 | C: 0.67198 [LOGITS Ex2 A] Mean Abs: 1.354 | Max: 5.523 [LOSS Ex2] A: 0.25059 | B: 0.42918 | C: 0.36959 ** [JOINT LOSS] ** : 1.024970 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002951 | Grad Max: 0.079720 -> Layer: shared_layers.0.bias | Grad Mean: 0.122075 | Grad Max: 0.578684 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.006836 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003978 | Grad Max: 0.003978 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000967 | Grad Max: 0.116589 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017709 | Grad Max: 0.660132 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.005529 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008220 | Grad Max: 0.031608 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000548 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001823 | Grad Max: 0.004733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000212 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000594 | Grad Max: 0.001613 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001302 | Grad Max: 0.003032 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018783 | Grad Max: 0.018783 [GRADIENT NORM TOTAL] 2.5501 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.048 | Max: 0.201 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51673365 0.48326635] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 247/1801 | B: 303/1745 | C: 260/1788 [LOSS Ex1] A: 0.67660 | B: 0.67695 | C: 0.67149 [LOGITS Ex2 A] Mean Abs: 1.368 | Max: 5.201 [LOSS Ex2] A: 0.28344 | B: 0.43096 | C: 0.37909 ** [JOINT LOSS] ** : 1.039511 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.160780 -> Layer: shared_layers.0.bias | Grad Mean: 0.309657 | Grad Max: 1.508895 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001889 | Grad Max: 0.007485 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004584 | Grad Max: 0.004584 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.165365 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039824 | Grad Max: 0.924650 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000475 | Grad Max: 0.013028 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018620 | Grad Max: 0.070850 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000849 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004089 | Grad Max: 0.009083 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000367 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001319 | Grad Max: 0.003312 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002388 | Grad Max: 0.004630 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039238 | Grad Max: 0.039238 [GRADIENT NORM TOTAL] 5.9282 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.319 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5661755 0.43382445] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 271/1777 | B: 297/1751 | C: 241/1807 [LOSS Ex1] A: 0.67485 | B: 0.67448 | C: 0.67279 [LOGITS Ex2 A] Mean Abs: 1.410 | Max: 5.422 [LOSS Ex2] A: 0.27022 | B: 0.42949 | C: 0.38335 ** [JOINT LOSS] ** : 1.035055 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007833 | Grad Max: 0.194120 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.430691 | Grad Max: 2.150725 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.008956 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009239 | Grad Max: 0.009239 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003110 | Grad Max: 0.253180 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058288 | Grad Max: 1.407096 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000687 | Grad Max: 0.017135 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026791 | Grad Max: 0.097624 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001235 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005929 | Grad Max: 0.013360 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000519 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001925 | Grad Max: 0.004514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003682 | Grad Max: 0.006738 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059106 | Grad Max: 0.059106 [GRADIENT NORM TOTAL] 8.4564 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.356 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024189 0.49758106] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 256/1792 | B: 270/1586 | C: 245/1803 [LOSS Ex1] A: 0.67746 | B: 0.67690 | C: 0.67210 [LOGITS Ex2 A] Mean Abs: 1.400 | Max: 4.967 [LOSS Ex2] A: 0.26817 | B: 0.39925 | C: 0.34570 ** [JOINT LOSS] ** : 1.013193 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003680 | Grad Max: 0.112252 -> Layer: shared_layers.0.bias | Grad Mean: 0.255038 | Grad Max: 1.315869 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.008546 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010209 | Grad Max: 0.010209 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001800 | Grad Max: 0.167749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034462 | Grad Max: 0.935517 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000422 
| Grad Max: 0.011430 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016768 | Grad Max: 0.059249 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000769 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003674 | Grad Max: 0.008054 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000341 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001205 | Grad Max: 0.003150 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002236 | Grad Max: 0.004949 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037738 | Grad Max: 0.037738 [GRADIENT NORM TOTAL] 5.0867 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.170 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5617827 0.4382173] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 265/1783 | B: 282/1766 | C: 262/1786 [LOSS Ex1] A: 0.67594 | B: 0.67643 | C: 0.67142 [LOGITS Ex2 A] Mean Abs: 1.360 | Max: 5.122 [LOSS Ex2] A: 0.26887 | B: 0.43300 | C: 0.35661 ** [JOINT LOSS] ** : 1.027426 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003716 | Grad Max: 0.098585 -> Layer: shared_layers.0.bias | Grad Mean: 0.153273 | Grad Max: 0.712122 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.008522 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006845 | Grad Max: 0.006845 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001123 | Grad Max: 0.164609 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020762 | Grad Max: 0.924113 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.006604 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008864 | Grad Max: 0.036241 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000567 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001978 | Grad Max: 0.005204 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000634 | Grad Max: 
0.001756 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001202 | Grad Max: 0.002530 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018372 | Grad Max: 0.018372 [GRADIENT NORM TOTAL] 3.1231 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.226 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53736824 0.46263176] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 221/1395 | B: 304/1744 | C: 274/1774 [LOSS Ex1] A: 0.67501 | B: 0.67687 | C: 0.67060 [LOGITS Ex2 A] Mean Abs: 1.369 | Max: 5.277 [LOSS Ex2] A: 0.25428 | B: 0.43338 | C: 0.36876 ** [JOINT LOSS] ** : 1.026299 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006058 | Grad Max: 0.133969 -> Layer: shared_layers.0.bias | Grad Mean: 0.328102 | Grad Max: 1.614570 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.007982 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001877 | Grad Max: 0.001877 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.250919 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044153 | Grad Max: 1.395669 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000523 | Grad Max: 0.013465 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020381 | Grad Max: 0.071994 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.001074 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004517 | Grad Max: 0.011435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000403 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001469 | Grad Max: 0.003584 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002796 | Grad Max: 0.005066 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045254 | Grad Max: 0.045254 [GRADIENT NORM TOTAL] 6.4338 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.357 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5074517 0.4925483] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 282/1766 | B: 299/1749 | C: 254/1794 [LOSS Ex1] A: 0.67525 | B: 0.67439 | C: 0.67315 [LOGITS Ex2 A] Mean Abs: 1.387 | Max: 5.610 [LOSS Ex2] A: 0.25586 | B: 0.41129 | C: 0.35814 ** [JOINT LOSS] ** : 1.016028 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003504 | Grad Max: 0.076814 -> Layer: shared_layers.0.bias | Grad Mean: 0.140381 | Grad Max: 0.701833 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.009409 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015277 | Grad Max: 0.015277 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001025 | Grad Max: 0.069544 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019118 | Grad Max: 0.382909 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.005851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008925 | Grad Max: 0.032916 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000457 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001983 | Grad Max: 0.004860 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000230 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001844 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001276 | Grad Max: 0.002840 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019472 | Grad Max: 0.019472 [GRADIENT NORM TOTAL] 2.6492 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030843 0.4969157] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 275/1773 | B: 272/1584 | C: 256/1792 [LOSS Ex1] A: 0.67445 | B: 0.67682 | C: 0.67160 [LOGITS Ex2 A] Mean Abs: 1.396 | Max: 5.106 [LOSS Ex2] A: 0.26141 | B: 0.39613 | C: 0.35134 ** [JOINT LOSS] ** : 1.010583 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.005455 | Grad Max: 0.125990 -> Layer: shared_layers.0.bias | Grad Mean: 0.268974 | Grad Max: 1.327690 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.009384 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011663 | Grad Max: 0.011663 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.170389 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036571 | Grad Max: 0.938565 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000432 | Grad Max: 0.010165 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016936 | Grad Max: 0.057904 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000847 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003738 | Grad Max: 0.009464 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000355 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001214 | Grad Max: 0.003026 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.005048 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036970 | Grad Max: 0.036970 [GRADIENT NORM TOTAL] 5.2568 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038058 0.49619418] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 274/1774 | B: 283/1765 | C: 252/1796 [LOSS Ex1] A: 0.67356 | B: 0.67635 | C: 0.67253 [LOGITS Ex2 A] Mean Abs: 1.426 | Max: 5.995 [LOSS Ex2] A: 0.28927 | B: 0.43907 | C: 0.37773 ** [JOINT LOSS] ** : 1.042833 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008052 | Grad Max: 0.259024 -> Layer: shared_layers.0.bias | Grad Mean: 0.340459 | Grad Max: 1.586569 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.009560 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009880 | Grad Max: 0.009880 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002545 | Grad Max: 0.200083 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.047109 | Grad Max: 1.064484 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.012004 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021288 | Grad Max: 0.068403 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.001003 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004763 | Grad Max: 0.010079 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000442 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001540 | Grad Max: 0.003706 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002912 | Grad Max: 0.005224 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047192 | Grad Max: 0.047192 [GRADIENT NORM TOTAL] 6.5604 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.293 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066951 0.49330494] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 250/1798 | B: 306/1742 | C: 254/1794 [LOSS Ex1] A: 0.67689 | B: 0.67678 | C: 0.67043 [LOGITS Ex2 A] Mean Abs: 1.368 | Max: 5.048 [LOSS Ex2] A: 0.25541 | B: 0.41976 | C: 0.35367 ** [JOINT LOSS] ** : 1.017642 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004365 | Grad Max: 0.086541 -> Layer: shared_layers.0.bias | Grad Mean: 0.228516 | Grad Max: 1.125059 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001902 | Grad Max: 0.007729 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003344 | Grad Max: 0.003344 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001626 | Grad Max: 0.137325 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030674 | Grad Max: 0.772786 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.009477 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014144 | Grad Max: 0.053398 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000636 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003117 | Grad Max: 0.006771 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | 
Grad Max: 0.000297 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001022 | Grad Max: 0.002494 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001773 | Grad Max: 0.004331 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030790 | Grad Max: 0.030790 [GRADIENT NORM TOTAL] 4.4894 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51688373 0.48311627] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.020 [MASKS] A(Pass/Fail): 257/1791 | B: 300/1748 | C: 258/1790 [LOSS Ex1] A: 0.67642 | B: 0.67430 | C: 0.67200 [LOGITS Ex2 A] Mean Abs: 1.316 | Max: 5.458 [LOSS Ex2] A: 0.27126 | B: 0.41224 | C: 0.35883 ** [JOINT LOSS] ** : 1.021685 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003560 | Grad Max: 0.116675 -> Layer: shared_layers.0.bias | Grad Mean: 0.269759 | Grad Max: 1.426335 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.007976 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001862 | Grad Max: 0.001862 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001872 | Grad Max: 0.144116 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035414 | Grad Max: 0.813111 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.010910 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017002 | Grad Max: 0.062045 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000851 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003730 | Grad Max: 0.008863 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001200 | Grad Max: 0.002943 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002140 | Grad Max: 0.004118 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035746 | Grad Max: 0.035746 [GRADIENT NORM TOTAL] 5.3139 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.322 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5669801 0.4330199] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.021 [MASKS] A(Pass/Fail): 274/1774 | B: 273/1583 | C: 235/1813 [LOSS Ex1] A: 0.67465 | B: 0.67673 | C: 0.67526 [LOGITS Ex2 A] Mean Abs: 1.338 | Max: 5.417 [LOSS Ex2] A: 0.25177 | B: 0.41126 | C: 0.34666 ** [JOINT LOSS] ** : 1.012110 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005366 | Grad Max: 0.120533 -> Layer: shared_layers.0.bias | Grad Mean: 0.353358 | Grad Max: 1.737320 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002067 | Grad Max: 0.009447 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017989 | Grad Max: 0.017989 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.201455 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045920 | Grad Max: 1.135480 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.013514 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021909 | Grad Max: 0.077363 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.001021 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004805 | Grad Max: 0.011059 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000446 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001533 | Grad Max: 0.003930 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002632 | Grad Max: 0.005222 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044081 | Grad Max: 0.044081 [GRADIENT NORM TOTAL] 6.7998 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.359 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023944 0.49760556] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 269/1779 | B: 284/1764 | C: 274/1774 [LOSS Ex1] A: 0.67730 | B: 0.67626 | C: 0.67316 [LOGITS Ex2 A] Mean Abs: 1.368 | Max: 5.265 [LOSS Ex2] A: 0.24454 | B: 0.43325 | C: 0.33926 
** [JOINT LOSS] ** : 1.014588 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003844 | Grad Max: 0.098388 -> Layer: shared_layers.0.bias | Grad Mean: 0.196788 | Grad Max: 1.007794 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.008163 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012444 | Grad Max: 0.012444 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001412 | Grad Max: 0.141250 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026194 | Grad Max: 0.776575 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.008764 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011820 | Grad Max: 0.046803 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002637 | Grad Max: 0.006604 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000242 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000851 | Grad Max: 0.002167 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001681 | Grad Max: 0.003663 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026126 | Grad Max: 0.026126 [GRADIENT NORM TOTAL] 3.9258 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56246275 0.43753725] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 272/1776 | B: 306/1742 | C: 167/1209 [LOSS Ex1] A: 0.67576 | B: 0.67670 | C: 0.67450 [LOGITS Ex2 A] Mean Abs: 1.388 | Max: 5.251 [LOSS Ex2] A: 0.27255 | B: 0.41973 | C: 0.34524 ** [JOINT LOSS] ** : 1.021495 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002520 | Grad Max: 0.070758 -> Layer: shared_layers.0.bias | Grad Mean: 0.194652 | Grad Max: 0.945777 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.008367 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007076 | Grad Max: 0.007076 -> Layer: exit2_layers.0.weight | 
Grad Mean: 0.001333 | Grad Max: 0.127047 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024845 | Grad Max: 0.721132 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.007369 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011429 | Grad Max: 0.042497 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000582 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002512 | Grad Max: 0.006295 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000253 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000822 | Grad Max: 0.002051 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001447 | Grad Max: 0.003715 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024500 | Grad Max: 0.024500 [GRADIENT NORM TOTAL] 3.8784 [EPOCH SUMMARY] Train Loss: 1.0231 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9993 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 44/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.229 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5378378 0.46216223] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 224/1392 | B: 303/1745 | C: 253/1795 [LOSS Ex1] A: 0.67483 | B: 0.67421 | C: 0.67385 [LOGITS Ex2 A] Mean Abs: 1.453 | Max: 5.693 [LOSS Ex2] A: 0.25450 | B: 0.41173 | C: 0.35859 ** [JOINT LOSS] ** : 1.015903 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005083 | Grad Max: 0.116040 -> Layer: shared_layers.0.bias | Grad Mean: 0.312213 | Grad Max: 1.579054 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.009108 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007810 | Grad Max: 0.007810 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.159567 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041079 | Grad Max: 0.864758 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000483 | Grad Max: 0.011965 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019050 | Grad Max: 0.068729 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000886 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004206 | Grad Max: 0.009339 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000428 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003505 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002485 | Grad Max: 0.005232 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042010 | Grad Max: 0.042010 [GRADIENT NORM TOTAL] 6.0523 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.360 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073994 0.49260062] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 288/1760 | B: 273/1583 | C: 262/1786 [LOSS Ex1] A: 0.67507 | B: 0.67665 | C: 0.67245 [LOGITS Ex2 A] Mean Abs: 1.416 | Max: 6.254 [LOSS Ex2] A: 0.25181 | B: 0.39213 | C: 0.35256 ** [JOINT LOSS] ** : 1.006893 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002941 | Grad Max: 0.058209 -> Layer: shared_layers.0.bias | Grad Mean: 0.138921 | Grad Max: 0.722374 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.009176 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012438 | Grad Max: 0.012438 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001014 | Grad Max: 0.108670 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018897 | Grad Max: 0.614398 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.005743 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008579 | Grad Max: 0.031882 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000419 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001902 | Grad Max: 0.004603 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000204 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000629 | Grad Max: 0.001703 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001148 | Grad Max: 0.003563 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019984 | Grad Max: 0.019984 [GRADIENT NORM TOTAL] 2.7780 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.338 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50319266 0.4968073 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 283/1765 | B: 284/1764 | C: 266/1782 [LOSS Ex1] A: 0.67426 | B: 0.67617 | C: 0.67092 [LOGITS Ex2 A] Mean Abs: 1.371 | Max: 5.308 [LOSS Ex2] A: 0.25009 | B: 0.42642 | C: 0.36296 ** [JOINT LOSS] ** : 1.020274 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005246 | Grad Max: 0.108249 -> Layer: shared_layers.0.bias | Grad Mean: 0.292851 | Grad Max: 1.400569 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.009161 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010366 | Grad Max: 0.010366 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.192296 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038383 | Grad Max: 1.103268 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.011665 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017485 | Grad Max: 0.063907 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000818 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003860 | Grad Max: 0.008552 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000359 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001248 | Grad Max: 0.003020 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002414 | Grad Max: 0.004422 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038175 | Grad Max: 0.038175 [GRADIENT NORM TOTAL] 5.6127 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.051 | Max: 0.347 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50373065 0.4962694 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 283/1765 | B: 306/1742 | C: 249/1799 [LOSS Ex1] A: 0.67336 | B: 0.67661 | C: 0.67342 [LOGITS Ex2 A] Mean Abs: 1.350 | Max: 6.103 [LOSS Ex2] A: 0.27520 | B: 0.43690 | C: 0.36550 ** [JOINT LOSS] ** : 1.033664 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005557 | Grad Max: 0.152910 -> Layer: shared_layers.0.bias | Grad Mean: 0.391826 | Grad Max: 1.957836 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.008917 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010393 | Grad Max: 0.010393 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002662 | Grad Max: 0.218444 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050725 | Grad Max: 1.259843 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000601 | Grad Max: 0.015108 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023802 | Grad Max: 0.087808 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001109 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005216 | Grad Max: 0.012148 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000452 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001679 | Grad Max: 0.004009 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003002 | Grad Max: 0.005566 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049889 | Grad Max: 0.049889 [GRADIENT NORM TOTAL] 7.5802 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.295 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066146 0.49338534] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 258/1790 | B: 303/1745 | C: 269/1779 [LOSS Ex1] A: 0.67674 | B: 0.67412 | C: 0.66988 [LOGITS Ex2 A] Mean Abs: 1.348 | Max: 5.113 [LOSS Ex2] A: 0.25316 | B: 0.41712 | C: 0.36839 ** [JOINT LOSS] ** : 1.019799 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003726 | Grad Max: 0.096601 -> Layer: shared_layers.0.bias | Grad Mean: 0.180531 | Grad Max: 0.867577 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.007368 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001185 | Grad Max: 0.001185 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001301 | Grad Max: 0.120370 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024289 | Grad Max: 0.676916 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.009067 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011207 | Grad Max: 0.045206 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000577 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002496 | Grad Max: 0.005781 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002161 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001578 | Grad Max: 0.003309 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023977 | Grad Max: 0.023977 [GRADIENT NORM TOTAL] 3.4991 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.204 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5171022 0.48289782] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 265/1783 | B: 273/1583 | C: 263/1785 [LOSS Ex1] A: 0.67628 | B: 0.67656 | C: 0.67283 [LOGITS Ex2 A] Mean Abs: 1.359 | Max: 5.129 [LOSS Ex2] A: 0.26623 | B: 0.39599 | C: 0.35871 ** [JOINT LOSS] ** : 1.015533 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004610 | Grad Max: 0.114234 -> Layer: shared_layers.0.bias | Grad Mean: 0.209444 | Grad Max: 1.037751 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001798 | Grad Max: 0.007805 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000801 | Grad Max: 0.000801 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001503 | Grad Max: 
0.129605 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028035 | Grad Max: 0.695996 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.008561 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012974 | Grad Max: 0.050246 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000601 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002891 | Grad Max: 0.006848 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000283 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000940 | Grad Max: 0.002529 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.004051 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027667 | Grad Max: 0.027667 [GRADIENT NORM TOTAL] 4.0473 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.324 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56782967 0.43217036] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 284/1764 | B: 286/1762 | C: 241/1807 [LOSS Ex1] A: 0.67447 | B: 0.67608 | C: 0.67325 [LOGITS Ex2 A] Mean Abs: 1.427 | Max: 5.249 [LOSS Ex2] A: 0.26191 | B: 0.43631 | C: 0.35280 ** [JOINT LOSS] ** : 1.024939 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006954 | Grad Max: 0.160299 -> Layer: shared_layers.0.bias | Grad Mean: 0.373784 | Grad Max: 1.817933 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.009430 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017535 | Grad Max: 0.017535 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002611 | Grad Max: 0.193216 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049154 | Grad Max: 1.044224 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.013893 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022849 | Grad Max: 0.080487 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.001058 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005059 | Grad Max: 0.011310 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000435 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001650 | Grad Max: 0.003930 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003029 | Grad Max: 0.006041 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050060 | Grad Max: 0.050060 [GRADIENT NORM TOTAL] 7.1650 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.361 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022903 0.49770972] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 277/1771 | B: 306/1742 | C: 270/1778 [LOSS Ex1] A: 0.67714 | B: 0.67652 | C: 0.67050 [LOGITS Ex2 A] Mean Abs: 1.404 | Max: 5.273 [LOSS Ex2] A: 0.25343 | B: 0.41965 | C: 0.33822 ** [JOINT LOSS] ** : 1.011823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002622 | Grad Max: 0.062794 -> Layer: shared_layers.0.bias | Grad Mean: 0.184800 | Grad Max: 0.925505 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.008337 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008297 | Grad Max: 0.008297 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001266 | Grad Max: 0.125008 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024129 | Grad Max: 0.710728 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.007644 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011465 | Grad Max: 0.045001 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002505 | Grad Max: 0.005702 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000821 | Grad Max: 0.002231 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001382 | Grad Max: 0.003939 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024169 | Grad Max: 0.024169 [GRADIENT NORM TOTAL] 3.6749 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.174 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56318676 0.43681327] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 280/1768 | B: 304/1744 | C: 263/1785 [LOSS Ex1] A: 0.67560 | B: 0.67403 | C: 0.67077 [LOGITS Ex2 A] Mean Abs: 1.362 | Max: 5.385 [LOSS Ex2] A: 0.26185 | B: 0.41472 | C: 0.37532 ** [JOINT LOSS] ** : 1.024097 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004978 | Grad Max: 0.106427 -> Layer: shared_layers.0.bias | Grad Mean: 0.256051 | Grad Max: 1.236791 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.008237 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001598 | Grad Max: 0.001598 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001860 | Grad Max: 0.212107 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035026 | Grad Max: 1.197060 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000413 | Grad Max: 0.009701 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016308 | Grad Max: 0.058455 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000831 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003612 | Grad Max: 0.008735 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000354 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001164 | Grad Max: 0.002936 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002055 | Grad Max: 0.003865 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033843 | Grad Max: 0.033843 [GRADIENT NORM TOTAL] 5.0603 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.231 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53832185 0.46167815] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 225/1391 | B: 274/1582 | C: 251/1797 [LOSS Ex1] A: 0.67466 | B: 0.67648 | C: 0.67261 [LOGITS Ex2 A] Mean Abs: 
1.401 | Max: 5.615 [LOSS Ex2] A: 0.24778 | B: 0.41122 | C: 0.36962 ** [JOINT LOSS] ** : 1.017459 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006887 | Grad Max: 0.172563 -> Layer: shared_layers.0.bias | Grad Mean: 0.375597 | Grad Max: 1.822445 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.008975 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007150 | Grad Max: 0.007150 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002700 | Grad Max: 0.253024 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050374 | Grad Max: 1.426127 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000599 | Grad Max: 0.014385 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023549 | Grad Max: 0.084621 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001042 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005241 | Grad Max: 0.011352 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000463 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001703 | Grad Max: 0.004131 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003250 | Grad Max: 0.005786 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052325 | Grad Max: 0.052325 [GRADIENT NORM TOTAL] 7.3245 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.362 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073206 0.49267936] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 299/1749 | B: 287/1761 | C: 254/1794 [LOSS Ex1] A: 0.67490 | B: 0.67600 | C: 0.67133 [LOGITS Ex2 A] Mean Abs: 1.391 | Max: 6.943 [LOSS Ex2] A: 0.24415 | B: 0.43517 | C: 0.35157 ** [JOINT LOSS] ** : 1.017711 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003389 | Grad Max: 0.081676 -> Layer: shared_layers.0.bias | Grad Mean: 0.170885 | Grad Max: 0.915802 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.008935 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.008734 | Grad Max: 0.008734 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001325 | Grad Max: 0.215941 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024509 | Grad Max: 1.218868 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.006471 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011261 | Grad Max: 0.040750 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000569 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002513 | Grad Max: 0.005960 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002179 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.003227 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023166 | Grad Max: 0.023166 [GRADIENT NORM TOTAL] 3.6948 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.340 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032924 0.49670762] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 290/1758 | B: 308/1740 | C: 271/1777 [LOSS Ex1] A: 0.67408 | B: 0.67644 | C: 0.67015 [LOGITS Ex2 A] Mean Abs: 1.424 | Max: 5.269 [LOSS Ex2] A: 0.25091 | B: 0.42451 | C: 0.36227 ** [JOINT LOSS] ** : 1.019452 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005047 | Grad Max: 0.115706 -> Layer: shared_layers.0.bias | Grad Mean: 0.255155 | Grad Max: 1.212954 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.009222 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008801 | Grad Max: 0.008801 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001774 | Grad Max: 0.166766 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033567 | Grad Max: 0.896113 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.009545 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015529 | Grad Max: 0.057359 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | 
Grad Max: 0.000832 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003418 | Grad Max: 0.008288 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000357 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001097 | Grad Max: 0.002788 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001894 | Grad Max: 0.004237 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031582 | Grad Max: 0.031582 [GRADIENT NORM TOTAL] 4.9123 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.349 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036386 0.4963614] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 294/1754 | B: 306/1742 | C: 231/1817 [LOSS Ex1] A: 0.67318 | B: 0.67393 | C: 0.67197 [LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.913 [LOSS Ex2] A: 0.27640 | B: 0.42214 | C: 0.36433 ** [JOINT LOSS] ** : 1.027318 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007222 | Grad Max: 0.209688 -> Layer: shared_layers.0.bias | Grad Mean: 0.339877 | Grad Max: 1.616648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.008099 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002559 | Grad Max: 0.002559 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002469 | Grad Max: 0.206477 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045944 | Grad Max: 1.071246 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.012984 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020850 | Grad Max: 0.071930 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000958 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004657 | Grad Max: 0.011068 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000404 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001512 | Grad Max: 0.003603 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002839 | Grad Max: 0.005235 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045712 | Grad Max: 
0.045712 [GRADIENT NORM TOTAL] 6.5483 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50654405 0.49345592] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 263/1785 | B: 275/1581 | C: 162/1214 [LOSS Ex1] A: 0.67658 | B: 0.67640 | C: 0.67438 [LOGITS Ex2 A] Mean Abs: 1.390 | Max: 5.419 [LOSS Ex2] A: 0.25223 | B: 0.40328 | C: 0.35823 ** [JOINT LOSS] ** : 1.013701 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002801 | Grad Max: 0.057848 -> Layer: shared_layers.0.bias | Grad Mean: 0.185146 | Grad Max: 0.813883 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001798 | Grad Max: 0.007413 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005241 | Grad Max: 0.005241 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001269 | Grad Max: 0.118703 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023855 | Grad Max: 0.666920 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.007695 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011151 | Grad Max: 0.047841 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000649 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002467 | Grad Max: 0.005877 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000809 | Grad Max: 0.002107 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001416 | Grad Max: 0.003828 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024632 | Grad Max: 0.024632 [GRADIENT NORM TOTAL] 3.6074 [EPOCH SUMMARY] Train Loss: 1.0192 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9926 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 45/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.049 | Max: 0.205 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5172878 0.4827122] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.519 | Std: 0.021 [MASKS] A(Pass/Fail): 270/1778 | B: 290/1758 | C: 268/1780 [LOSS Ex1] A: 0.67612 | B: 0.67591 | C: 0.67102 [LOGITS Ex2 A] Mean Abs: 1.319 | Max: 5.583 [LOSS Ex2] A: 0.25801 | B: 0.43564 | C: 0.35019 ** [JOINT LOSS] ** : 1.022298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004215 | Grad Max: 0.098572 -> Layer: shared_layers.0.bias | Grad Mean: 0.251763 | Grad Max: 1.269976 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.008039 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002330 | Grad Max: 0.002330 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.172768 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032721 | Grad Max: 0.991341 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.010386 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015295 | Grad Max: 0.061105 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000779 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003378 | Grad Max: 0.007820 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000321 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001088 | Grad Max: 0.002809 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002083 | Grad Max: 0.004047 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032596 | Grad Max: 0.032596 [GRADIENT NORM TOTAL] 5.0108 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.326 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56867164 0.43132836] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 297/1751 | B: 310/1738 | C: 266/1782 [LOSS Ex1] A: 0.67429 | B: 0.67636 | C: 0.67139 [LOGITS Ex2 A] Mean Abs: 1.372 | Max: 5.176 [LOSS Ex2] A: 0.24117 | B: 0.43063 | C: 0.35251 ** [JOINT LOSS] ** : 1.015449 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004670 | Grad Max: 0.119356 -> Layer: shared_layers.0.bias | Grad Mean: 0.317058 | Grad Max: 1.596569 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009399 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013945 | Grad Max: 0.013945 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.179790 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040420 | Grad Max: 1.033652 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000475 | Grad Max: 0.010916 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018872 | Grad Max: 0.066608 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000918 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004146 | Grad Max: 0.009244 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000394 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001328 | Grad Max: 0.003347 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002391 | Grad Max: 0.004545 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038884 | Grad Max: 0.038884 [GRADIENT NORM TOTAL] 6.1083 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.364 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022282 0.49777183] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 284/1764 | B: 310/1738 | C: 277/1771 [LOSS Ex1] A: 0.67699 | B: 0.67384 | C: 0.67001 [LOGITS Ex2 A] Mean Abs: 1.388 | Max: 5.058 [LOSS Ex2] A: 0.24934 | B: 0.40255 | C: 0.35273 ** [JOINT LOSS] ** : 1.008489 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003633 | Grad Max: 0.091492 
-> Layer: shared_layers.0.bias | Grad Mean: 0.150032 | Grad Max: 0.687604 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.007895 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004663 | Grad Max: 0.004663 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001053 | Grad Max: 0.101174 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019229 | Grad Max: 0.563323 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000216 | Grad Max: 0.005177 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008510 | Grad Max: 0.029463 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001952 | Grad Max: 0.004831 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000622 | Grad Max: 0.001749 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001212 | Grad Max: 0.003418 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018407 | Grad Max: 0.018407 [GRADIENT NORM TOTAL] 2.8285 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.175 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5638724 0.43612763] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 287/1761 | B: 276/1580 | C: 239/1809 [LOSS Ex1] A: 0.67543 | B: 0.67631 | C: 0.67337 [LOGITS Ex2 A] Mean Abs: 1.424 | Max: 5.341 [LOSS Ex2] A: 0.27251 | B: 0.39728 | C: 0.37043 ** [JOINT LOSS] ** : 1.021776 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004780 | Grad Max: 0.098255 -> Layer: shared_layers.0.bias | Grad Mean: 0.226698 | Grad Max: 1.146889 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.007781 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003619 | Grad Max: 0.003619 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001611 | Grad Max: 0.105147 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030089 | Grad Max: 0.596413 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.009537 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013985 | Grad Max: 0.051011 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000649 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003094 | Grad Max: 0.007253 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000280 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001003 | Grad Max: 0.002444 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001901 | Grad Max: 0.004105 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030545 | Grad Max: 0.030545 [GRADIENT NORM TOTAL] 4.3652 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.233 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5387793 0.46122068] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 229/1387 | B: 291/1757 | C: 262/1786 [LOSS Ex1] A: 0.67449 | B: 0.67582 | C: 0.67081 [LOGITS Ex2 A] Mean Abs: 1.467 | Max: 5.325 [LOSS Ex2] A: 0.24818 | B: 0.43560 | C: 0.33529 ** [JOINT LOSS] ** : 1.013395 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005017 | Grad Max: 0.110944 -> Layer: shared_layers.0.bias | Grad Mean: 0.277218 | Grad Max: 1.376778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.008223 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001014 | Grad Max: 0.001014 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001899 | Grad Max: 0.136014 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036064 | Grad Max: 0.714299 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000425 | Grad Max: 0.009987 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016811 | Grad Max: 0.060596 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000781 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003716 | Grad Max: 0.008453 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000350 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001196 | Grad Max: 0.002951 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002094 | Grad Max: 0.004766 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035697 | Grad Max: 0.035697 [GRADIENT NORM TOTAL] 5.2634 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.365 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072783 0.49272168] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 311/1737 | B: 310/1738 | C: 271/1777 [LOSS Ex1] A: 0.67472 | B: 0.67626 | C: 0.66999 [LOGITS Ex2 A] Mean Abs: 1.425 | Max: 6.859 [LOSS Ex2] A: 0.26198 | B: 0.42402 | C: 0.33133 ** [JOINT LOSS] ** : 1.012771 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002676 | Grad Max: 0.058524 -> Layer: shared_layers.0.bias | Grad Mean: 0.121225 | Grad Max: 0.571884 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.008311 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005951 | Grad Max: 0.005951 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000852 | Grad Max: 0.090465 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015485 | Grad Max: 0.496991 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.005441 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006708 | Grad Max: 0.027822 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000442 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001496 | Grad Max: 0.003816 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000187 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000492 | Grad Max: 0.001439 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000831 | Grad Max: 0.002646 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014250 | Grad Max: 0.014250 [GRADIENT NORM TOTAL] 2.3998 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.052 | Max: 0.342 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503341 0.49665898] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 307/1741 | B: 313/1735 | C: 243/1805 [LOSS Ex1] A: 0.67388 | B: 0.67374 | C: 0.67163 [LOGITS Ex2 A] Mean Abs: 1.377 | Max: 5.350 [LOSS Ex2] A: 0.25314 | B: 0.41391 | C: 0.36026 ** [JOINT LOSS] ** : 1.015521 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004596 | Grad Max: 0.105901 -> Layer: shared_layers.0.bias | Grad Mean: 0.300405 | Grad Max: 1.397904 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.009351 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009870 | Grad Max: 0.009870 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.175927 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037469 | Grad Max: 0.999396 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.012439 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017787 | Grad Max: 0.068812 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000826 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003918 | Grad Max: 0.008572 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000324 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.002979 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002409 | Grad Max: 0.004284 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037915 | Grad Max: 0.037915 [GRADIENT NORM TOTAL] 5.6711 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.352 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50359046 0.49640954] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 295/1753 | B: 277/1579 | C: 258/1790 [LOSS Ex1] A: 0.67298 | B: 0.67622 | C: 0.67198 [LOGITS Ex2 A] Mean Abs: 1.362 | Max: 6.043 [LOSS Ex2] A: 0.27259 | B: 0.41412 | C: 0.35946 ** [JOINT LOSS] ** : 1.022453 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006318 | Grad Max: 0.157675 -> Layer: shared_layers.0.bias | Grad Mean: 0.424076 | Grad Max: 2.048934 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.008871 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006531 | Grad Max: 0.006531 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002796 | Grad Max: 0.222666 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053271 | Grad Max: 1.197317 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.015877 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025234 | Grad Max: 0.093247 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005567 | Grad Max: 0.012951 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000499 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001794 | Grad Max: 0.004318 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003282 | Grad Max: 0.006186 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053190 | Grad Max: 0.053190 [GRADIENT NORM TOTAL] 8.0063 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.299 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50650173 0.4934983 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 272/1776 | B: 291/1757 | C: 248/1800 [LOSS Ex1] A: 0.67642 | B: 0.67572 | C: 0.67179 [LOGITS Ex2 A] Mean Abs: 1.371 | Max: 5.231 [LOSS Ex2] A: 0.25511 | B: 0.42846 | C: 0.36349 ** [JOINT LOSS] ** : 1.023662 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005319 | Grad Max: 0.132021 -> Layer: shared_layers.0.bias | Grad Mean: 0.253097 | Grad Max: 1.221457 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001838 | Grad Max: 0.007329 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000115 | Grad Max: 0.000115 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 
0.176570 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034683 | Grad Max: 1.011572 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.010138 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015885 | Grad Max: 0.054005 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000723 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003528 | Grad Max: 0.007554 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000335 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001144 | Grad Max: 0.002893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002337 | Grad Max: 0.004288 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034978 | Grad Max: 0.034978 [GRADIENT NORM TOTAL] 5.0353 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.206 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5174494 0.48255062] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 278/1770 | B: 310/1738 | C: 278/1770 [LOSS Ex1] A: 0.67595 | B: 0.67617 | C: 0.67084 [LOGITS Ex2 A] Mean Abs: 1.383 | Max: 5.632 [LOSS Ex2] A: 0.25012 | B: 0.42498 | C: 0.35776 ** [JOINT LOSS] ** : 1.018607 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003141 | Grad Max: 0.075057 -> Layer: shared_layers.0.bias | Grad Mean: 0.205128 | Grad Max: 1.029907 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001870 | Grad Max: 0.007591 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001880 | Grad Max: 0.001880 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001406 | Grad Max: 0.120069 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026368 | Grad Max: 0.682591 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.007750 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011690 | Grad Max: 0.043398 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000643 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002584 | Grad Max: 0.006291 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000264 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000839 | Grad Max: 0.002127 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001442 | Grad Max: 0.003539 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024708 | Grad Max: 0.024708 [GRADIENT NORM TOTAL] 4.0505 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.329 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5694984 0.43050155] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 307/1741 | B: 313/1735 | C: 244/1804 [LOSS Ex1] A: 0.67410 | B: 0.67364 | C: 0.67315 [LOGITS Ex2 A] Mean Abs: 1.438 | Max: 5.274 [LOSS Ex2] A: 0.25662 | B: 0.41473 | C: 0.36739 ** [JOINT LOSS] ** : 1.019874 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007380 | Grad Max: 0.175377 -> Layer: shared_layers.0.bias | Grad Mean: 0.394464 | Grad Max: 2.023442 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.009640 -> Layer: exit1_layers.0.bias | Grad Mean: 0.018375 | Grad Max: 0.018375 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002792 | Grad Max: 0.209177 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052876 | Grad Max: 1.116848 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000617 | Grad Max: 0.014735 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024419 | Grad Max: 0.083974 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001070 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005410 | Grad Max: 0.011620 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000492 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001748 | Grad Max: 0.004137 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003246 | Grad Max: 0.006276 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052262 | Grad Max: 0.052262 [GRADIENT NORM TOTAL] 7.6012 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.367 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022001 0.49779987] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 288/1760 | B: 279/1577 | C: 248/1800 [LOSS Ex1] A: 0.67682 | B: 0.67613 | C: 0.67196 [LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.066 [LOSS Ex2] A: 0.25629 | B: 0.40237 | C: 0.35751 ** [JOINT LOSS] ** : 1.013692 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003048 | Grad Max: 0.087374 -> Layer: shared_layers.0.bias | Grad Mean: 0.204920 | Grad Max: 1.014286 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.008909 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013838 | Grad Max: 0.013838 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001380 | Grad Max: 0.131426 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026347 | Grad Max: 0.730105 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.009215 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012673 | Grad Max: 0.052971 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000713 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002770 | Grad Max: 0.006904 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000900 | Grad Max: 0.002306 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001676 | Grad Max: 0.003853 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027774 | Grad Max: 0.027774 [GRADIENT NORM TOTAL] 3.9114 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.177 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5645635 0.43543643] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 292/1756 | B: 296/1752 | C: 259/1789 [LOSS Ex1] A: 0.67525 | B: 0.67563 | C: 0.67159 [LOGITS Ex2 A] Mean Abs: 
1.384 | Max: 5.638 [LOSS Ex2] A: 0.26683 | B: 0.43003 | C: 0.38139 ** [JOINT LOSS] ** : 1.033572 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003615 | Grad Max: 0.100221 -> Layer: shared_layers.0.bias | Grad Mean: 0.251046 | Grad Max: 1.222869 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.008339 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002712 | Grad Max: 0.002712 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.236627 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032821 | Grad Max: 1.338515 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.010575 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015328 | Grad Max: 0.060855 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000740 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003400 | Grad Max: 0.007583 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000317 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001109 | Grad Max: 0.002717 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002131 | Grad Max: 0.004458 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034081 | Grad Max: 0.034081 [GRADIENT NORM TOTAL] 5.0880 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.236 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5392329 0.46076712] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 232/1384 | B: 310/1738 | C: 145/1231 [LOSS Ex1] A: 0.67430 | B: 0.67608 | C: 0.67393 [LOGITS Ex2 A] Mean Abs: 1.414 | Max: 5.590 [LOSS Ex2] A: 0.25225 | B: 0.42892 | C: 0.35771 ** [JOINT LOSS] ** : 1.021063 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006223 | Grad Max: 0.143286 -> Layer: shared_layers.0.bias | Grad Mean: 0.352821 | Grad Max: 1.773617 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.008946 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.009917 | Grad Max: 0.009917 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002514 | Grad Max: 0.255859 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046884 | Grad Max: 1.435117 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000553 | Grad Max: 0.013710 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021849 | Grad Max: 0.079247 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000996 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004853 | Grad Max: 0.010558 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000451 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001574 | Grad Max: 0.003880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003119 | Grad Max: 0.005440 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048455 | Grad Max: 0.048455 [GRADIENT NORM TOTAL] 6.9134 [EPOCH SUMMARY] Train Loss: 1.0188 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9894 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9924 -> New: 0.9894) ############################## EPOCH 46/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.368 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072664 0.4927336] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 317/1731 | B: 317/1731 | C: 269/1779 [LOSS Ex1] A: 0.67454 | B: 0.67355 | C: 0.67131 [LOGITS Ex2 A] Mean Abs: 1.405 | Max: 5.177 [LOSS Ex2] A: 0.25235 | B: 0.40569 | C: 0.37168 ** [JOINT LOSS] ** : 1.016375 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004618 | Grad Max: 0.105803 -> Layer: shared_layers.0.bias | Grad Mean: 0.188473 | Grad Max: 0.967525 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002004 | Grad Max: 0.008049 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002995 | Grad Max: 0.002995 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001339 | Grad Max: 
0.085560 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025360 | Grad Max: 0.434791 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.007593 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012032 | Grad Max: 0.043556 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000574 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002666 | Grad Max: 0.005975 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002332 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001651 | Grad Max: 0.003213 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026032 | Grad Max: 0.026032 [GRADIENT NORM TOTAL] 3.5397 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.345 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033882 0.49661183] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 317/1731 | B: 280/1576 | C: 273/1775 [LOSS Ex1] A: 0.67370 | B: 0.67604 | C: 0.66994 [LOGITS Ex2 A] Mean Abs: 1.450 | Max: 5.198 [LOSS Ex2] A: 0.25154 | B: 0.39985 | C: 0.34133 ** [JOINT LOSS] ** : 1.004133 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005498 | Grad Max: 0.122706 -> Layer: shared_layers.0.bias | Grad Mean: 0.276671 | Grad Max: 1.352551 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.009625 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014805 | Grad Max: 0.014805 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.160366 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037314 | Grad Max: 0.877789 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000445 | Grad Max: 0.011218 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017706 | Grad Max: 0.061429 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000913 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003919 | Grad Max: 0.009880 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000355 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001269 | Grad Max: 0.003066 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002316 | Grad Max: 0.005072 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038344 | Grad Max: 0.038344 [GRADIENT NORM TOTAL] 5.3686 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.354 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50354743 0.49645257] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 304/1744 | B: 296/1752 | C: 240/1808 [LOSS Ex1] A: 0.67280 | B: 0.67554 | C: 0.67330 [LOGITS Ex2 A] Mean Abs: 1.457 | Max: 5.993 [LOSS Ex2] A: 0.28309 | B: 0.43823 | C: 0.36407 ** [JOINT LOSS] ** : 1.035675 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008728 | Grad Max: 0.232486 -> Layer: shared_layers.0.bias | Grad Mean: 0.424894 | Grad Max: 2.043810 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.009360 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011840 | Grad Max: 0.011840 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003075 | Grad Max: 0.228928 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057388 | Grad Max: 1.174939 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000669 | Grad Max: 0.016778 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026406 | Grad Max: 0.093903 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001355 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005881 | Grad Max: 0.013978 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000513 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001897 | Grad Max: 0.004686 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003439 | Grad Max: 0.006326 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056024 | Grad Max: 0.056024 [GRADIENT NORM TOTAL] 8.1254 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.301 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064656 0.49353442] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 279/1769 | B: 309/1739 | C: 247/1801 [LOSS Ex1] A: 0.67626 | B: 0.67599 | C: 0.67262 [LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.299 [LOSS Ex2] A: 0.25602 | B: 0.42629 | C: 0.36295 ** [JOINT LOSS] ** : 1.023380 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004604 | Grad Max: 0.121041 -> Layer: shared_layers.0.bias | Grad Mean: 0.228361 | Grad Max: 1.083431 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001811 | Grad Max: 0.007621 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003604 | Grad Max: 0.003604 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001565 | Grad Max: 0.143703 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029542 | Grad Max: 0.760579 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.009138 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013792 | Grad Max: 0.051761 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000771 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003047 | Grad Max: 0.008375 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000283 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000982 | Grad Max: 0.002373 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.003926 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028172 | Grad Max: 0.028172 [GRADIENT NORM TOTAL] 4.3458 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.050 | Max: 0.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51759434 0.48240563] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.021 [MASKS] A(Pass/Fail): 283/1765 | B: 318/1730 | C: 264/1784 [LOSS Ex1] A: 0.67580 | B: 0.67347 | C: 0.66997 [LOGITS Ex2 A] Mean Abs: 
1.336 | Max: 5.603 [LOSS Ex2] A: 0.26627 | B: 0.41522 | C: 0.34053 ** [JOINT LOSS] ** : 1.013753 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003054 | Grad Max: 0.070023 -> Layer: shared_layers.0.bias | Grad Mean: 0.213544 | Grad Max: 0.997498 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.007695 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002006 | Grad Max: 0.002006 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001440 | Grad Max: 0.147676 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027399 | Grad Max: 0.814221 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.009361 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012849 | Grad Max: 0.050959 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000577 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002812 | Grad Max: 0.006554 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000288 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000892 | Grad Max: 0.002406 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001590 | Grad Max: 0.003676 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025028 | Grad Max: 0.025028 [GRADIENT NORM TOTAL] 4.1497 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.331 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57030034 0.42969963] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 312/1736 | B: 281/1575 | C: 252/1796 [LOSS Ex1] A: 0.67392 | B: 0.67596 | C: 0.67111 [LOGITS Ex2 A] Mean Abs: 1.380 | Max: 5.511 [LOSS Ex2] A: 0.24704 | B: 0.39979 | C: 0.36876 ** [JOINT LOSS] ** : 1.012194 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005299 | Grad Max: 0.113820 -> Layer: shared_layers.0.bias | Grad Mean: 0.330200 | Grad Max: 1.589133 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.009200 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.013475 | Grad Max: 0.013475 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.204533 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042764 | Grad Max: 1.181630 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000502 | Grad Max: 0.012764 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020061 | Grad Max: 0.078161 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000927 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004428 | Grad Max: 0.010491 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000417 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001417 | Grad Max: 0.003523 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002551 | Grad Max: 0.005151 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041294 | Grad Max: 0.041294 [GRADIENT NORM TOTAL] 6.3868 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.370 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50215673 0.49784324] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 290/1758 | B: 297/1751 | C: 236/1812 [LOSS Ex1] A: 0.67667 | B: 0.67546 | C: 0.67283 [LOGITS Ex2 A] Mean Abs: 1.400 | Max: 5.739 [LOSS Ex2] A: 0.24205 | B: 0.42885 | C: 0.34979 ** [JOINT LOSS] ** : 1.015214 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004814 | Grad Max: 0.147848 -> Layer: shared_layers.0.bias | Grad Mean: 0.207582 | Grad Max: 0.991924 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001858 | Grad Max: 0.007881 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006234 | Grad Max: 0.006234 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.146271 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027808 | Grad Max: 0.823186 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.006904 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012042 | Grad Max: 0.039378 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | 
Grad Max: 0.000636 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002715 | Grad Max: 0.006385 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000878 | Grad Max: 0.002383 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001708 | Grad Max: 0.003615 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026195 | Grad Max: 0.026195 [GRADIENT NORM TOTAL] 4.0929 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5652136 0.4347864] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 295/1753 | B: 311/1737 | C: 244/1804 [LOSS Ex1] A: 0.67508 | B: 0.67591 | C: 0.67162 [LOGITS Ex2 A] Mean Abs: 1.445 | Max: 5.056 [LOSS Ex2] A: 0.25985 | B: 0.42863 | C: 0.35235 ** [JOINT LOSS] ** : 1.021146 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003529 | Grad Max: 0.085187 -> Layer: shared_layers.0.bias | Grad Mean: 0.230217 | Grad Max: 1.092062 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.008252 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006073 | Grad Max: 0.006073 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001522 | Grad Max: 0.126594 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028715 | Grad Max: 0.717857 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.008006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013361 | Grad Max: 0.051678 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000731 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002943 | Grad Max: 0.007015 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000319 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000951 | Grad Max: 0.002442 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.003554 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028015 | Grad Max: 
0.028015 [GRADIENT NORM TOTAL] 4.4132 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.238 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5396543 0.46034566] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 237/1379 | B: 322/1726 | C: 248/1800 [LOSS Ex1] A: 0.67414 | B: 0.67338 | C: 0.67104 [LOGITS Ex2 A] Mean Abs: 1.485 | Max: 5.845 [LOSS Ex2] A: 0.24344 | B: 0.40546 | C: 0.35504 ** [JOINT LOSS] ** : 1.007498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005522 | Grad Max: 0.127247 -> Layer: shared_layers.0.bias | Grad Mean: 0.338266 | Grad Max: 1.690731 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.008307 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001396 | Grad Max: 0.001396 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002307 | Grad Max: 0.171253 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043952 | Grad Max: 0.930888 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000515 | Grad Max: 0.014357 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020572 | Grad Max: 0.077374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000933 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004533 | Grad Max: 0.009982 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000415 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001463 | Grad Max: 0.003555 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002524 | Grad Max: 0.005302 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042983 | Grad Max: 0.042983 [GRADIENT NORM TOTAL] 6.5079 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.371 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072278 0.4927722] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 321/1727 | B: 283/1573 | C: 
243/1805 [LOSS Ex1] A: 0.67437 | B: 0.67588 | C: 0.67337 [LOGITS Ex2 A] Mean Abs: 1.453 | Max: 6.612 [LOSS Ex2] A: 0.25437 | B: 0.39795 | C: 0.36614 ** [JOINT LOSS] ** : 1.014029 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003003 | Grad Max: 0.062745 -> Layer: shared_layers.0.bias | Grad Mean: 0.152884 | Grad Max: 0.763390 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.009104 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014919 | Grad Max: 0.014919 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001117 | Grad Max: 0.101677 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020937 | Grad Max: 0.574762 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.006588 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009695 | Grad Max: 0.039439 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000531 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002124 | Grad Max: 0.005960 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000202 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000691 | Grad Max: 0.001844 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001220 | Grad Max: 0.002947 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020531 | Grad Max: 0.020531 [GRADIENT NORM TOTAL] 3.0515 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.347 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034351 0.4965649] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 322/1726 | B: 297/1751 | C: 261/1787 [LOSS Ex1] A: 0.67351 | B: 0.67537 | C: 0.67190 [LOGITS Ex2 A] Mean Abs: 1.405 | Max: 6.166 [LOSS Ex2] A: 0.24846 | B: 0.43834 | C: 0.36256 ** [JOINT LOSS] ** : 1.023379 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004937 | Grad Max: 0.110548 -> Layer: shared_layers.0.bias | Grad Mean: 0.301164 | Grad Max: 1.468276 -> Layer: exit1_layers.0.weight | Grad 
Mean: 0.002033 | Grad Max: 0.009173 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010930 | Grad Max: 0.010930 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.171031 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038863 | Grad Max: 0.961129 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000453 | Grad Max: 0.012218 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018174 | Grad Max: 0.069477 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000844 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004023 | Grad Max: 0.008640 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000358 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001297 | Grad Max: 0.003253 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002369 | Grad Max: 0.004371 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038181 | Grad Max: 0.038181 [GRADIENT NORM TOTAL] 5.7454 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.356 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50348884 0.49651122] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 309/1739 | B: 312/1736 | C: 265/1783 [LOSS Ex1] A: 0.67261 | B: 0.67583 | C: 0.67043 [LOGITS Ex2 A] Mean Abs: 1.369 | Max: 6.297 [LOSS Ex2] A: 0.26408 | B: 0.43337 | C: 0.34261 ** [JOINT LOSS] ** : 1.019642 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006215 | Grad Max: 0.149591 -> Layer: shared_layers.0.bias | Grad Mean: 0.399671 | Grad Max: 1.971603 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.009170 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008782 | Grad Max: 0.008782 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002673 | Grad Max: 0.239605 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051014 | Grad Max: 1.293868 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000609 | Grad Max: 0.014650 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024400 
| Grad Max: 0.085105 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001045 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005353 | Grad Max: 0.012195 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000500 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001721 | Grad Max: 0.004411 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003033 | Grad Max: 0.005299 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049857 | Grad Max: 0.049857 [GRADIENT NORM TOTAL] 7.5749 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.303 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064067 0.4935933] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 286/1762 | B: 323/1725 | C: 282/1766 [LOSS Ex1] A: 0.67611 | B: 0.67329 | C: 0.66907 [LOGITS Ex2 A] Mean Abs: 1.373 | Max: 4.975 [LOSS Ex2] A: 0.25077 | B: 0.41359 | C: 0.35777 ** [JOINT LOSS] ** : 1.013535 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004309 | Grad Max: 0.103400 -> Layer: shared_layers.0.bias | Grad Mean: 0.250803 | Grad Max: 1.265411 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.007415 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002581 | Grad Max: 0.002581 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001719 | Grad Max: 0.107579 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032408 | Grad Max: 0.587134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.009230 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015464 | Grad Max: 0.051707 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003408 | Grad Max: 0.007936 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000351 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001092 | Grad Max: 0.002825 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001983 | Grad Max: 
0.004199 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031897 | Grad Max: 0.031897 [GRADIENT NORM TOTAL] 4.7577 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51775974 0.48224023] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 290/1758 | B: 284/1572 | C: 186/1190 [LOSS Ex1] A: 0.67565 | B: 0.67580 | C: 0.66849 [LOGITS Ex2 A] Mean Abs: 1.371 | Max: 5.331 [LOSS Ex2] A: 0.25910 | B: 0.39376 | C: 0.37233 ** [JOINT LOSS] ** : 1.015040 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004745 | Grad Max: 0.107869 -> Layer: shared_layers.0.bias | Grad Mean: 0.215026 | Grad Max: 1.015331 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.007903 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000846 | Grad Max: 0.000846 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.141289 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029052 | Grad Max: 0.760325 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.009572 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013242 | Grad Max: 0.055511 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000695 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002958 | Grad Max: 0.007031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000276 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000964 | Grad Max: 0.002354 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001697 | Grad Max: 0.003875 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028640 | Grad Max: 0.028640 [GRADIENT NORM TOTAL] 4.1170 [EPOCH SUMMARY] Train Loss: 1.0168 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9948 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 47/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.333 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5711283 0.42887172] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 320/1728 | B: 298/1750 | C: 249/1799 [LOSS Ex1] A: 0.67374 | B: 0.67528 | C: 0.67113 [LOGITS Ex2 A] Mean Abs: 1.425 | Max: 5.556 [LOSS Ex2] A: 0.24238 | B: 0.43608 | C: 0.34933 ** [JOINT LOSS] ** : 1.015980 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005941 | Grad Max: 0.137127 -> Layer: shared_layers.0.bias | Grad Mean: 0.307943 | Grad Max: 1.495190 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.009193 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014094 | Grad Max: 0.014094 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.163013 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040844 | Grad Max: 0.841758 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.011305 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018974 | Grad Max: 0.067237 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004226 | Grad Max: 0.009294 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000348 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001367 | Grad Max: 0.003233 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002407 | Grad Max: 0.005207 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040638 | Grad Max: 0.040638 [GRADIENT NORM TOTAL] 5.8835 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.372 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020884 0.4979116] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 296/1752 | B: 314/1734 | C: 243/1805 [LOSS Ex1] A: 0.67651 | B: 0.67574 | C: 0.67258 [LOGITS Ex2 A] Mean Abs: 1.414 | Max: 5.002 [LOSS Ex2] A: 0.24437 | B: 0.42117 | C: 0.35282 ** [JOINT LOSS] ** : 1.014396 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002632 | Grad Max: 0.061344 -> Layer: shared_layers.0.bias | Grad Mean: 0.160107 | Grad Max: 0.802911 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.008564 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014701 | Grad Max: 0.014701 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001163 | Grad Max: 0.114490 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021863 | Grad Max: 0.644134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.006352 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010108 | Grad Max: 0.039018 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000568 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002213 | Grad Max: 0.005320 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000723 | Grad Max: 0.001840 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001205 | Grad Max: 0.003318 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021076 | Grad Max: 0.021076 [GRADIENT NORM TOTAL] 3.2526 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.181 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5659208 0.43407914] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.022 [MASKS] A(Pass/Fail): 301/1747 | B: 324/1724 | C: 246/1802 [LOSS Ex1] A: 0.67491 | B: 0.67320 | C: 0.67208 [LOGITS Ex2 A] Mean Abs: 1.389 | Max: 5.275 [LOSS Ex2] A: 0.26158 | B: 0.40960 | C: 0.35774 ** [JOINT LOSS] ** : 1.016368 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004582 | Grad Max: 0.098565 
-> Layer: shared_layers.0.bias | Grad Mean: 0.275415 | Grad Max: 1.342393 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.008289 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002956 | Grad Max: 0.002956 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.205696 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036241 | Grad Max: 1.163772 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.009554 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016237 | Grad Max: 0.060123 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000791 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003594 | Grad Max: 0.008225 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000306 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001155 | Grad Max: 0.002777 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002113 | Grad Max: 0.004270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033995 | Grad Max: 0.033995 [GRADIENT NORM TOTAL] 5.4673 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.240 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5401435 0.45985645] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 250/1366 | B: 287/1569 | C: 222/1826 [LOSS Ex1] A: 0.67396 | B: 0.67571 | C: 0.67283 [LOGITS Ex2 A] Mean Abs: 1.401 | Max: 5.368 [LOSS Ex2] A: 0.24645 | B: 0.41079 | C: 0.38543 ** [JOINT LOSS] ** : 1.021728 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007225 | Grad Max: 0.178924 -> Layer: shared_layers.0.bias | Grad Mean: 0.439408 | Grad Max: 2.140693 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001908 | Grad Max: 0.008466 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004332 | Grad Max: 0.004332 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003042 | Grad Max: 0.256063 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057400 | Grad Max: 1.413489 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000671 | Grad Max: 0.015464 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026826 | Grad Max: 0.094338 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001113 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005943 | Grad Max: 0.012916 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000509 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001919 | Grad Max: 0.004731 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003550 | Grad Max: 0.006578 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057614 | Grad Max: 0.057614 [GRADIENT NORM TOTAL] 8.4282 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.373 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50716627 0.49283376] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 328/1720 | B: 298/1750 | C: 253/1795 [LOSS Ex1] A: 0.67420 | B: 0.67520 | C: 0.67141 [LOGITS Ex2 A] Mean Abs: 1.403 | Max: 6.773 [LOSS Ex2] A: 0.24504 | B: 0.43235 | C: 0.35436 ** [JOINT LOSS] ** : 1.017521 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004827 | Grad Max: 0.127729 -> Layer: shared_layers.0.bias | Grad Mean: 0.228703 | Grad Max: 1.069963 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.008787 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008341 | Grad Max: 0.008341 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001650 | Grad Max: 0.183700 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031018 | Grad Max: 1.052038 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.009146 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014262 | Grad Max: 0.052186 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000747 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003186 | Grad Max: 0.007575 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000269 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001021 | Grad Max: 0.002441 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001826 | Grad Max: 0.003526 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029403 | Grad Max: 0.029403 [GRADIENT NORM TOTAL] 4.4379 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.349 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5035439 0.49645603] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 327/1721 | B: 314/1734 | C: 269/1779 [LOSS Ex1] A: 0.67333 | B: 0.67566 | C: 0.67057 [LOGITS Ex2 A] Mean Abs: 1.416 | Max: 5.352 [LOSS Ex2] A: 0.25629 | B: 0.42988 | C: 0.36235 ** [JOINT LOSS] ** : 1.022692 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003173 | Grad Max: 0.076531 -> Layer: shared_layers.0.bias | Grad Mean: 0.185400 | Grad Max: 0.878651 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.008642 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007309 | Grad Max: 0.007309 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001228 | Grad Max: 0.113957 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022869 | Grad Max: 0.621411 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000264 | Grad Max: 0.006942 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010652 | Grad Max: 0.040383 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000547 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002373 | Grad Max: 0.005628 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.001981 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001258 | Grad Max: 0.003060 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021613 | Grad Max: 0.021613 [GRADIENT NORM TOTAL] 3.5609 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.053 | Max: 0.358 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034003 0.49659967] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 318/1730 | B: 325/1723 | C: 267/1781 [LOSS Ex1] A: 0.67243 | B: 0.67310 | C: 0.67099 [LOGITS Ex2 A] Mean Abs: 1.432 | Max: 5.559 [LOSS Ex2] A: 0.27132 | B: 0.40441 | C: 0.34487 ** [JOINT LOSS] ** : 1.012374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007697 | Grad Max: 0.229520 -> Layer: shared_layers.0.bias | Grad Mean: 0.329675 | Grad Max: 1.581736 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.008834 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005878 | Grad Max: 0.005878 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002443 | Grad Max: 0.181983 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044936 | Grad Max: 0.934587 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000509 | Grad Max: 0.012427 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020066 | Grad Max: 0.068886 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000882 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004540 | Grad Max: 0.009924 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000407 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001484 | Grad Max: 0.003803 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002733 | Grad Max: 0.005554 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044430 | Grad Max: 0.044430 [GRADIENT NORM TOTAL] 6.3086 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.305 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063333 0.4936667] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 292/1756 | B: 287/1569 | C: 281/1767 [LOSS Ex1] A: 0.67596 | B: 0.67563 | C: 0.66925 [LOGITS Ex2 A] Mean Abs: 1.405 | Max: 5.079 [LOSS Ex2] A: 0.23907 | B: 0.40066 | C: 0.34874 ** [JOINT LOSS] ** : 1.003102 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002735 | Grad Max: 0.064078 -> Layer: shared_layers.0.bias | Grad Mean: 0.127255 | Grad Max: 0.627468 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001882 | Grad Max: 0.007408 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000818 | Grad Max: 0.000818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000879 | Grad Max: 0.093627 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016284 | Grad Max: 0.517268 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.004893 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007246 | Grad Max: 0.026702 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000407 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001602 | Grad Max: 0.004116 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000174 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000521 | Grad Max: 0.001495 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000829 | Grad Max: 0.003030 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015305 | Grad Max: 0.015305 [GRADIENT NORM TOTAL] 2.4578 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.210 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.517954 0.48204604] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 296/1752 | B: 299/1749 | C: 250/1798 [LOSS Ex1] A: 0.67549 | B: 0.67511 | C: 0.67161 [LOGITS Ex2 A] Mean Abs: 1.330 | Max: 5.506 [LOSS Ex2] A: 0.25337 | B: 0.43879 | C: 0.34701 ** [JOINT LOSS] ** : 1.020459 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004409 | Grad Max: 0.109617 -> Layer: shared_layers.0.bias | Grad Mean: 0.272121 | Grad Max: 1.393224 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.008151 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004433 | Grad Max: 0.004433 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 
0.158829 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036255 | Grad Max: 0.903652 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.010420 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017105 | Grad Max: 0.061709 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000753 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003802 | Grad Max: 0.008513 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000341 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001221 | Grad Max: 0.002952 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002176 | Grad Max: 0.004136 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035401 | Grad Max: 0.035401 [GRADIENT NORM TOTAL] 5.4156 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5719583 0.42804173] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 339/1709 | B: 315/1733 | C: 260/1788 [LOSS Ex1] A: 0.67355 | B: 0.67556 | C: 0.66997 [LOGITS Ex2 A] Mean Abs: 1.384 | Max: 5.409 [LOSS Ex2] A: 0.24085 | B: 0.42823 | C: 0.34514 ** [JOINT LOSS] ** : 1.011106 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006727 | Grad Max: 0.186652 -> Layer: shared_layers.0.bias | Grad Mean: 0.334908 | Grad Max: 1.664445 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.008799 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011413 | Grad Max: 0.011413 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.179130 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045309 | Grad Max: 1.008950 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000536 | Grad Max: 0.013988 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021370 | Grad Max: 0.082375 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000976 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004756 | Grad Max: 0.011118 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000422 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001525 | Grad Max: 0.003851 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002677 | Grad Max: 0.005452 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043801 | Grad Max: 0.043801 [GRADIENT NORM TOTAL] 6.4660 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.375 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50202525 0.49797478] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 302/1746 | B: 326/1722 | C: 239/1809 [LOSS Ex1] A: 0.67636 | B: 0.67301 | C: 0.67222 [LOGITS Ex2 A] Mean Abs: 1.391 | Max: 5.258 [LOSS Ex2] A: 0.24156 | B: 0.41169 | C: 0.36534 ** [JOINT LOSS] ** : 1.013392 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004795 | Grad Max: 0.131866 -> Layer: shared_layers.0.bias | Grad Mean: 0.155741 | Grad Max: 0.718826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.007640 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004664 | Grad Max: 0.004664 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001167 | Grad Max: 0.078036 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021200 | Grad Max: 0.449484 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.006417 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009575 | Grad Max: 0.036347 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000458 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002162 | Grad Max: 0.004926 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000699 | Grad Max: 0.001844 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001369 | Grad Max: 0.003123 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020274 | Grad Max: 0.020274 [GRADIENT NORM TOTAL] 2.9190 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.182 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5666016 0.43339843] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 308/1740 | B: 289/1567 | C: 296/1752 [LOSS Ex1] A: 0.67475 | B: 0.67553 | C: 0.66786 [LOGITS Ex2 A] Mean Abs: 1.437 | Max: 5.701 [LOSS Ex2] A: 0.26849 | B: 0.40191 | C: 0.35160 ** [JOINT LOSS] ** : 1.013382 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004889 | Grad Max: 0.115872 -> Layer: shared_layers.0.bias | Grad Mean: 0.279501 | Grad Max: 1.405154 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.008673 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006974 | Grad Max: 0.006974 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.165337 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037456 | Grad Max: 0.916805 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.011678 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017706 | Grad Max: 0.064116 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000844 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003895 | Grad Max: 0.008377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000361 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001268 | Grad Max: 0.003042 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002380 | Grad Max: 0.005021 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039345 | Grad Max: 0.039345 [GRADIENT NORM TOTAL] 5.4791 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.243 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5405743 0.45942572] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 259/1357 | B: 300/1748 | C: 263/1785 [LOSS Ex1] A: 0.67379 | B: 0.67501 | C: 0.67067 [LOGITS Ex2 A] Mean Abs: 
1.490 | Max: 5.332 [LOSS Ex2] A: 0.25294 | B: 0.44794 | C: 0.34018 ** [JOINT LOSS] ** : 1.020178 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007281 | Grad Max: 0.159832 -> Layer: shared_layers.0.bias | Grad Mean: 0.409706 | Grad Max: 1.994413 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001976 | Grad Max: 0.008538 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002297 | Grad Max: 0.002297 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.215267 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053600 | Grad Max: 1.203458 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000629 | Grad Max: 0.017537 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025276 | Grad Max: 0.099692 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001136 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005577 | Grad Max: 0.012054 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000474 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001792 | Grad Max: 0.004258 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003064 | Grad Max: 0.005671 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052229 | Grad Max: 0.052229 [GRADIENT NORM TOTAL] 7.8259 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.376 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50714207 0.49285793] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 332/1716 | B: 316/1732 | C: 180/1196 [LOSS Ex1] A: 0.67403 | B: 0.67547 | C: 0.66954 [LOGITS Ex2 A] Mean Abs: 1.454 | Max: 6.310 [LOSS Ex2] A: 0.26680 | B: 0.41666 | C: 0.36037 ** [JOINT LOSS] ** : 1.020955 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006174 | Grad Max: 0.167416 -> Layer: shared_layers.0.bias | Grad Mean: 0.304102 | Grad Max: 1.398295 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.008324 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.006305 | Grad Max: 0.006305 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.191771 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040023 | Grad Max: 1.013485 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000458 | Grad Max: 0.011445 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018263 | Grad Max: 0.063587 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000722 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004072 | Grad Max: 0.008895 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000370 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001321 | Grad Max: 0.003278 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002346 | Grad Max: 0.004782 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038541 | Grad Max: 0.038541 [GRADIENT NORM TOTAL] 5.9161 [EPOCH SUMMARY] Train Loss: 1.0160 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9849 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9894 -> New: 0.9849) ############################## EPOCH 48/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.351 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50357544 0.49642453] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 336/1712 | B: 326/1722 | C: 262/1786 [LOSS Ex1] A: 0.67314 | B: 0.67291 | C: 0.66976 [LOGITS Ex2 A] Mean Abs: 1.417 | Max: 5.986 [LOSS Ex2] A: 0.23669 | B: 0.40817 | C: 0.35135 ** [JOINT LOSS] ** : 1.004006 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001835 | Grad Max: 0.043338 -> Layer: shared_layers.0.bias | Grad Mean: 0.121080 | Grad Max: 0.600969 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.008930 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007331 | Grad Max: 0.007331 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000856 | Grad Max: 
0.082029 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015490 | Grad Max: 0.459672 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.005423 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007132 | Grad Max: 0.030806 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000400 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001567 | Grad Max: 0.004166 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000165 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000500 | Grad Max: 0.001442 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000928 | Grad Max: 0.002718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014388 | Grad Max: 0.014388 [GRADIENT NORM TOTAL] 2.4248 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.361 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033771 0.4966229] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 325/1723 | B: 290/1566 | C: 252/1796 [LOSS Ex1] A: 0.67224 | B: 0.67545 | C: 0.67123 [LOGITS Ex2 A] Mean Abs: 1.408 | Max: 5.765 [LOSS Ex2] A: 0.25493 | B: 0.39948 | C: 0.34442 ** [JOINT LOSS] ** : 1.005916 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003111 | Grad Max: 0.086394 -> Layer: shared_layers.0.bias | Grad Mean: 0.255682 | Grad Max: 1.178819 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.009468 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011129 | Grad Max: 0.011129 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001590 | Grad Max: 0.146565 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029698 | Grad Max: 0.842963 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000345 | Grad Max: 0.009369 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013969 | Grad Max: 0.056086 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000621 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003057 | Grad Max: 0.007283 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000312 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.002671 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001637 | Grad Max: 0.003960 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026782 | Grad Max: 0.026782 [GRADIENT NORM TOTAL] 4.8023 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.307 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50632185 0.49367815] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 298/1750 | B: 301/1747 | C: 245/1803 [LOSS Ex1] A: 0.67579 | B: 0.67492 | C: 0.67321 [LOGITS Ex2 A] Mean Abs: 1.382 | Max: 4.950 [LOSS Ex2] A: 0.24686 | B: 0.42431 | C: 0.34342 ** [JOINT LOSS] ** : 1.012836 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.040354 -> Layer: shared_layers.0.bias | Grad Mean: 0.084519 | Grad Max: 0.288024 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001845 | Grad Max: 0.007627 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006141 | Grad Max: 0.006141 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000626 | Grad Max: 0.093569 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011118 | Grad Max: 0.535572 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.003858 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004548 | Grad Max: 0.019903 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000322 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001020 | Grad Max: 0.002883 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.001003 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000634 | Grad Max: 0.002115 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008865 | Grad Max: 0.008865 [GRADIENT NORM TOTAL] 1.7811 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.051 | Max: 0.211 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51810944 0.48189062] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.520 | Std: 0.022 [MASKS] A(Pass/Fail): 307/1741 | B: 316/1732 | C: 261/1787 [LOSS Ex1] A: 0.67532 | B: 0.67538 | C: 0.67099 [LOGITS Ex2 A] Mean Abs: 1.392 | Max: 5.328 [LOSS Ex2] A: 0.26394 | B: 0.42462 | C: 0.35359 ** [JOINT LOSS] ** : 1.021281 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005299 | Grad Max: 0.122013 -> Layer: shared_layers.0.bias | Grad Mean: 0.268789 | Grad Max: 1.237447 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.007817 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002149 | Grad Max: 0.002149 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001874 | Grad Max: 0.146556 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034770 | Grad Max: 0.782862 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.009730 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015989 | Grad Max: 0.053467 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000755 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003581 | Grad Max: 0.007784 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001154 | Grad Max: 0.002876 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001988 | Grad Max: 0.004326 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033410 | Grad Max: 0.033410 [GRADIENT NORM TOTAL] 5.1613 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.338 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5728628 0.42713714] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 347/1701 | B: 326/1722 | C: 276/1772 [LOSS Ex1] A: 0.67335 | B: 0.67280 | C: 0.66865 [LOGITS Ex2 A] Mean Abs: 
1.437 | Max: 5.715 [LOSS Ex2] A: 0.24312 | B: 0.41571 | C: 0.35149 ** [JOINT LOSS] ** : 1.008374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005161 | Grad Max: 0.133228 -> Layer: shared_layers.0.bias | Grad Mean: 0.356174 | Grad Max: 1.595729 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.008736 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006379 | Grad Max: 0.006379 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002413 | Grad Max: 0.193680 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045372 | Grad Max: 1.099101 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.012860 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021553 | Grad Max: 0.076402 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000949 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004771 | Grad Max: 0.010573 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000387 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001547 | Grad Max: 0.003746 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002712 | Grad Max: 0.005745 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045950 | Grad Max: 0.045950 [GRADIENT NORM TOTAL] 6.8349 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.378 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501994 0.49800596] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 316/1732 | B: 290/1566 | C: 272/1776 [LOSS Ex1] A: 0.67617 | B: 0.67534 | C: 0.66954 [LOGITS Ex2 A] Mean Abs: 1.420 | Max: 5.023 [LOSS Ex2] A: 0.23913 | B: 0.39646 | C: 0.32575 ** [JOINT LOSS] ** : 0.994131 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002862 | Grad Max: 0.075379 -> Layer: shared_layers.0.bias | Grad Mean: 0.100342 | Grad Max: 0.420279 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.008575 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.011295 | Grad Max: 0.011295 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000777 | Grad Max: 0.089578 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013429 | Grad Max: 0.505148 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.004431 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005495 | Grad Max: 0.024086 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000372 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001160 | Grad Max: 0.003639 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000385 | Grad Max: 0.001228 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000712 | Grad Max: 0.002590 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011919 | Grad Max: 0.011919 [GRADIENT NORM TOTAL] 2.0667 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.185 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5674179 0.43258202] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 315/1733 | B: 301/1747 | C: 257/1791 [LOSS Ex1] A: 0.67454 | B: 0.67481 | C: 0.67065 [LOGITS Ex2 A] Mean Abs: 1.394 | Max: 5.564 [LOSS Ex2] A: 0.25725 | B: 0.43894 | C: 0.35901 ** [JOINT LOSS] ** : 1.025068 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005987 | Grad Max: 0.145733 -> Layer: shared_layers.0.bias | Grad Mean: 0.340487 | Grad Max: 1.622887 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001966 | Grad Max: 0.008228 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006025 | Grad Max: 0.006025 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.245523 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043702 | Grad Max: 1.388818 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000504 | Grad Max: 0.011210 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020267 | Grad Max: 0.068426 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | 
Grad Max: 0.000927 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004479 | Grad Max: 0.010233 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000403 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001441 | Grad Max: 0.003566 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002627 | Grad Max: 0.005160 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042109 | Grad Max: 0.042109 [GRADIENT NORM TOTAL] 6.5320 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.246 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54112005 0.45887992] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 266/1350 | B: 316/1732 | C: 257/1791 [LOSS Ex1] A: 0.67358 | B: 0.67527 | C: 0.67116 [LOGITS Ex2 A] Mean Abs: 1.400 | Max: 5.126 [LOSS Ex2] A: 0.25172 | B: 0.43402 | C: 0.38199 ** [JOINT LOSS] ** : 1.029249 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007664 | Grad Max: 0.181387 -> Layer: shared_layers.0.bias | Grad Mean: 0.469052 | Grad Max: 2.290030 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.007935 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002736 | Grad Max: 0.002736 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003184 | Grad Max: 0.276009 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060484 | Grad Max: 1.556705 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000712 | Grad Max: 0.017963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028668 | Grad Max: 0.102595 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001191 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006335 | Grad Max: 0.014011 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000519 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002058 | Grad Max: 0.004697 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003959 | Grad Max: 0.007817 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063501 | Grad Max: 
0.063501 [GRADIENT NORM TOTAL] 8.9677 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.379 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50710845 0.49289155] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 339/1709 | B: 328/1720 | C: 244/1804 [LOSS Ex1] A: 0.67381 | B: 0.67269 | C: 0.67137 [LOGITS Ex2 A] Mean Abs: 1.422 | Max: 6.800 [LOSS Ex2] A: 0.24371 | B: 0.41329 | C: 0.36191 ** [JOINT LOSS] ** : 1.012262 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005222 | Grad Max: 0.119860 -> Layer: shared_layers.0.bias | Grad Mean: 0.235330 | Grad Max: 1.206516 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.008766 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010936 | Grad Max: 0.010936 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001756 | Grad Max: 0.216859 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033181 | Grad Max: 1.225959 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.008801 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015098 | Grad Max: 0.051123 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000756 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003366 | Grad Max: 0.007808 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000313 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001081 | Grad Max: 0.002635 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002009 | Grad Max: 0.003856 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031502 | Grad Max: 0.031502 [GRADIENT NORM TOTAL] 4.7631 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.354 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50364685 0.49635312] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 349/1699 | B: 294/1562 | 
C: 264/1784 [LOSS Ex1] A: 0.67291 | B: 0.67524 | C: 0.66935 [LOGITS Ex2 A] Mean Abs: 1.456 | Max: 5.766 [LOSS Ex2] A: 0.24059 | B: 0.39927 | C: 0.36433 ** [JOINT LOSS] ** : 1.007230 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003549 | Grad Max: 0.092070 -> Layer: shared_layers.0.bias | Grad Mean: 0.164763 | Grad Max: 0.806368 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.008846 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007803 | Grad Max: 0.007803 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.107016 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020739 | Grad Max: 0.538420 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.006250 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009528 | Grad Max: 0.033198 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000493 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002116 | Grad Max: 0.004998 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000220 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000674 | Grad Max: 0.001720 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001059 | Grad Max: 0.002635 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018325 | Grad Max: 0.018325 [GRADIENT NORM TOTAL] 3.1462 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.363 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50328034 0.49671966] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 331/1717 | B: 303/1745 | C: 240/1808 [LOSS Ex1] A: 0.67201 | B: 0.67470 | C: 0.67196 [LOGITS Ex2 A] Mean Abs: 1.472 | Max: 6.283 [LOSS Ex2] A: 0.27475 | B: 0.42775 | C: 0.35753 ** [JOINT LOSS] ** : 1.026232 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007963 | Grad Max: 0.244065 -> Layer: shared_layers.0.bias | Grad Mean: 0.299226 | Grad Max: 1.429959 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002000 | Grad Max: 0.008626 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005784 | Grad Max: 0.005784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.164164 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040105 | Grad Max: 0.809773 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.010760 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017758 | Grad Max: 0.059633 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000921 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004014 | Grad Max: 0.009551 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000357 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001283 | Grad Max: 0.003265 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002250 | Grad Max: 0.004287 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036635 | Grad Max: 0.036635 [GRADIENT NORM TOTAL] 5.6638 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.309 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062355 0.49376455] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 311/1737 | B: 316/1732 | C: 233/1815 [LOSS Ex1] A: 0.67561 | B: 0.67516 | C: 0.67115 [LOGITS Ex2 A] Mean Abs: 1.415 | Max: 6.110 [LOSS Ex2] A: 0.23993 | B: 0.41279 | C: 0.33798 ** [JOINT LOSS] ** : 1.004204 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003475 | Grad Max: 0.082888 -> Layer: shared_layers.0.bias | Grad Mean: 0.146103 | Grad Max: 0.635050 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.007723 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005645 | Grad Max: 0.005645 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000983 | Grad Max: 0.102319 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018022 | Grad Max: 0.536336 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.005774 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.007999 | Grad Max: 0.030094 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000431 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001781 | Grad Max: 0.004341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000220 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000582 | Grad Max: 0.001701 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000966 | Grad Max: 0.003244 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017062 | Grad Max: 0.017062 [GRADIENT NORM TOTAL] 2.7510 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.213 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51833415 0.48166585] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 316/1732 | B: 330/1718 | C: 261/1787 [LOSS Ex1] A: 0.67513 | B: 0.67258 | C: 0.67078 [LOGITS Ex2 A] Mean Abs: 1.350 | Max: 5.322 [LOSS Ex2] A: 0.25430 | B: 0.40586 | C: 0.34371 ** [JOINT LOSS] ** : 1.007449 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004681 | Grad Max: 0.102360 -> Layer: shared_layers.0.bias | Grad Mean: 0.279930 | Grad Max: 1.379538 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.007764 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000125 | Grad Max: 0.000125 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001917 | Grad Max: 0.234994 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036600 | Grad Max: 1.314459 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000420 | Grad Max: 0.010253 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017070 | Grad Max: 0.060058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000749 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003772 | Grad Max: 0.008252 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001199 | Grad Max: 0.003149 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002180 | 
Grad Max: 0.004278 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034602 | Grad Max: 0.034602 [GRADIENT NORM TOTAL] 5.5036 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.340 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57389414 0.42610592] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 354/1694 | B: 296/1560 | C: 195/1181 [LOSS Ex1] A: 0.67313 | B: 0.67513 | C: 0.66736 [LOGITS Ex2 A] Mean Abs: 1.392 | Max: 5.670 [LOSS Ex2] A: 0.23748 | B: 0.40828 | C: 0.34933 ** [JOINT LOSS] ** : 1.003569 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005729 | Grad Max: 0.116342 -> Layer: shared_layers.0.bias | Grad Mean: 0.354439 | Grad Max: 1.629878 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.009254 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012771 | Grad Max: 0.012771 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.237787 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044997 | Grad Max: 1.339518 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000526 | Grad Max: 0.013703 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021180 | Grad Max: 0.076581 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000976 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004684 | Grad Max: 0.010377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000425 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001488 | Grad Max: 0.003873 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002598 | Grad Max: 0.005018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042148 | Grad Max: 0.042148 [GRADIENT NORM TOTAL] 6.8622 [EPOCH SUMMARY] Train Loss: 1.0116 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9857 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 49/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.381 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019112 0.4980888] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 321/1727 | B: 303/1745 | C: 250/1798 [LOSS Ex1] A: 0.67598 | B: 0.67459 | C: 0.67147 [LOGITS Ex2 A] Mean Abs: 1.426 | Max: 5.379 [LOSS Ex2] A: 0.23698 | B: 0.42669 | C: 0.34435 ** [JOINT LOSS] ** : 1.010024 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004422 | Grad Max: 0.145876 -> Layer: shared_layers.0.bias | Grad Mean: 0.183203 | Grad Max: 0.811981 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.008453 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011506 | Grad Max: 0.011506 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001291 | Grad Max: 0.114021 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023058 | Grad Max: 0.613790 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.006081 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010355 | Grad Max: 0.034552 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000508 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002370 | Grad Max: 0.005641 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000239 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000762 | Grad Max: 0.001961 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.003023 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022665 | Grad Max: 0.022665 [GRADIENT NORM TOTAL] 3.4220 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.187 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56826067 0.4317393 ] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 321/1727 | B: 316/1732 | C: 258/1790 [LOSS Ex1] A: 0.67434 | B: 0.67505 | C: 0.67002 [LOGITS Ex2 A] Mean Abs: 1.461 | Max: 5.272 [LOSS Ex2] A: 0.25424 | B: 0.42857 | C: 0.35394 ** [JOINT LOSS] ** : 1.018719 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003143 | Grad Max: 0.098847 -> Layer: shared_layers.0.bias | Grad Mean: 0.291476 | Grad Max: 1.440985 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.007660 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001133 | Grad Max: 0.001133 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.155394 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036218 | Grad Max: 0.886967 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.011219 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016831 | Grad Max: 0.064059 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000733 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003689 | Grad Max: 0.008115 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001177 | Grad Max: 0.002950 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001988 | Grad Max: 0.004230 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033648 | Grad Max: 0.033648 [GRADIENT NORM TOTAL] 5.7523 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.248 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54166 0.45834005] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 274/1342 | B: 332/1716 | C: 227/1821 [LOSS Ex1] A: 0.67338 | B: 0.67246 | C: 0.67345 [LOGITS Ex2 A] Mean Abs: 1.521 | Max: 5.480 [LOSS Ex2] A: 0.24799 | B: 0.41497 | C: 0.34281 ** [JOINT LOSS] ** : 1.008353 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005495 | Grad Max: 0.148288 -> 
Layer: shared_layers.0.bias | Grad Mean: 0.374406 | Grad Max: 1.865800 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.008304 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003088 | Grad Max: 0.003088 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002536 | Grad Max: 0.203690 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048059 | Grad Max: 1.115474 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000553 | Grad Max: 0.013723 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022465 | Grad Max: 0.084186 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001016 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004938 | Grad Max: 0.010793 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000443 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001581 | Grad Max: 0.003747 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002808 | Grad Max: 0.005420 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046396 | Grad Max: 0.046396 [GRADIENT NORM TOTAL] 7.2566 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.382 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070578 0.49294224] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 346/1702 | B: 298/1558 | C: 247/1801 [LOSS Ex1] A: 0.67361 | B: 0.67503 | C: 0.67124 [LOGITS Ex2 A] Mean Abs: 1.478 | Max: 6.196 [LOSS Ex2] A: 0.25121 | B: 0.40171 | C: 0.34980 ** [JOINT LOSS] ** : 1.007534 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004129 | Grad Max: 0.097160 -> Layer: shared_layers.0.bias | Grad Mean: 0.264942 | Grad Max: 1.267545 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002014 | Grad Max: 0.008868 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012212 | Grad Max: 0.012212 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001804 | Grad Max: 0.148551 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034176 | Grad Max: 0.794081 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.010102 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015995 | Grad Max: 0.059016 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000682 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003517 | Grad Max: 0.007714 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000331 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001132 | Grad Max: 0.002867 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002003 | Grad Max: 0.004393 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033186 | Grad Max: 0.033186 [GRADIENT NORM TOTAL] 5.0804 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.356 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037253 0.49627474] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 360/1688 | B: 303/1745 | C: 277/1771 [LOSS Ex1] A: 0.67269 | B: 0.67449 | C: 0.66879 [LOGITS Ex2 A] Mean Abs: 1.426 | Max: 5.024 [LOSS Ex2] A: 0.23717 | B: 0.43048 | C: 0.34024 ** [JOINT LOSS] ** : 1.007953 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002951 | Grad Max: 0.072665 -> Layer: shared_layers.0.bias | Grad Mean: 0.155143 | Grad Max: 0.694074 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.008986 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007887 | Grad Max: 0.007887 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001082 | Grad Max: 0.140860 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020033 | Grad Max: 0.798523 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000226 | Grad Max: 0.005639 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009084 | Grad Max: 0.035319 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000485 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002027 | Grad Max: 0.005274 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000208 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000651 | Grad Max: 0.001726 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001259 | Grad Max: 0.002814 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019391 | Grad Max: 0.019391 [GRADIENT NORM TOTAL] 3.0595 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.366 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50322175 0.49677828] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 339/1709 | B: 317/1731 | C: 279/1769 [LOSS Ex1] A: 0.67178 | B: 0.67495 | C: 0.66835 [LOGITS Ex2 A] Mean Abs: 1.424 | Max: 5.301 [LOSS Ex2] A: 0.25091 | B: 0.42626 | C: 0.34325 ** [JOINT LOSS] ** : 1.011835 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003358 | Grad Max: 0.110652 -> Layer: shared_layers.0.bias | Grad Mean: 0.269989 | Grad Max: 1.307577 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.008733 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003176 | Grad Max: 0.003176 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001737 | Grad Max: 0.157471 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032943 | Grad Max: 0.899085 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.010563 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015887 | Grad Max: 0.064790 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003481 | Grad Max: 0.008576 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.002826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002053 | Grad Max: 0.003763 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033173 | Grad Max: 0.033173 [GRADIENT NORM TOTAL] 5.1228 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.053 | Max: 0.311 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061836 0.49381638] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 319/1729 | B: 335/1713 | C: 241/1807 [LOSS Ex1] A: 0.67542 | B: 0.67236 | C: 0.67179 [LOGITS Ex2 A] Mean Abs: 1.412 | Max: 5.554 [LOSS Ex2] A: 0.24160 | B: 0.41614 | C: 0.34362 ** [JOINT LOSS] ** : 1.006977 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.054095 -> Layer: shared_layers.0.bias | Grad Mean: 0.107348 | Grad Max: 0.487001 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.006897 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001095 | Grad Max: 0.001095 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000752 | Grad Max: 0.097512 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013722 | Grad Max: 0.551173 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.005537 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006342 | Grad Max: 0.025139 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000449 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001388 | Grad Max: 0.003964 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000160 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001352 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000939 | Grad Max: 0.002515 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013870 | Grad Max: 0.013870 [GRADIENT NORM TOTAL] 2.0853 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.052 | Max: 0.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5185363 0.4814637] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 318/1730 | B: 300/1556 | C: 273/1775 [LOSS Ex1] A: 0.67495 | B: 0.67493 | C: 0.66839 [LOGITS Ex2 A] Mean Abs: 1.439 | Max: 5.222 [LOSS Ex2] A: 0.26271 | B: 0.40449 | C: 0.33393 ** [JOINT LOSS] ** : 1.006464 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007252 | Grad Max: 0.185921 -> Layer: shared_layers.0.bias | Grad Mean: 0.351470 | Grad Max: 1.634982 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001975 | Grad Max: 0.008419 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007019 | Grad Max: 0.007019 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002426 | Grad Max: 0.216047 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045859 | Grad Max: 1.128587 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000530 | Grad Max: 0.014215 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021363 | Grad Max: 0.074842 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000927 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004735 | Grad Max: 0.010152 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000416 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001523 | Grad Max: 0.003701 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002849 | Grad Max: 0.005520 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045476 | Grad Max: 0.045476 [GRADIENT NORM TOTAL] 6.6687 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.343 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.574838 0.42516208] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 358/1690 | B: 305/1743 | C: 253/1795 [LOSS Ex1] A: 0.67291 | B: 0.67438 | C: 0.67045 [LOGITS Ex2 A] Mean Abs: 1.486 | Max: 5.337 [LOSS Ex2] A: 0.26064 | B: 0.43947 | C: 0.36787 ** [JOINT LOSS] ** : 1.028577 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009681 | Grad Max: 0.239559 -> Layer: shared_layers.0.bias | Grad Mean: 0.449521 | Grad Max: 2.128288 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.009690 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017375 | Grad Max: 0.017375 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003166 | Grad Max: 
0.257063 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059208 | Grad Max: 1.353118 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.016861 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027717 | Grad Max: 0.099589 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001310 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006190 | Grad Max: 0.014195 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000512 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001977 | Grad Max: 0.004492 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003556 | Grad Max: 0.006029 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057252 | Grad Max: 0.057252 [GRADIENT NORM TOTAL] 8.5015 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.384 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50186664 0.49813333] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.023 [MASKS] A(Pass/Fail): 326/1722 | B: 318/1730 | C: 290/1758 [LOSS Ex1] A: 0.67580 | B: 0.67485 | C: 0.66564 [LOGITS Ex2 A] Mean Abs: 1.460 | Max: 5.316 [LOSS Ex2] A: 0.23878 | B: 0.41892 | C: 0.35139 ** [JOINT LOSS] ** : 1.008461 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002857 | Grad Max: 0.091631 -> Layer: shared_layers.0.bias | Grad Mean: 0.246648 | Grad Max: 1.220353 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.007600 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000732 | Grad Max: 0.000732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001636 | Grad Max: 0.140782 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030939 | Grad Max: 0.803134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.011060 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014642 | Grad Max: 0.061118 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000658 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003168 | Grad Max: 0.006723 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000308 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001011 | Grad Max: 0.002407 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001743 | Grad Max: 0.003981 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029710 | Grad Max: 0.029710 [GRADIENT NORM TOTAL] 4.8182 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.189 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5690585 0.4309415] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 324/1724 | B: 336/1712 | C: 250/1798 [LOSS Ex1] A: 0.67414 | B: 0.67225 | C: 0.67176 [LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.127 [LOSS Ex2] A: 0.24532 | B: 0.40110 | C: 0.35459 ** [JOINT LOSS] ** : 1.006387 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003815 | Grad Max: 0.076548 -> Layer: shared_layers.0.bias | Grad Mean: 0.204464 | Grad Max: 0.998095 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.008163 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005307 | Grad Max: 0.005307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001476 | Grad Max: 0.099600 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027758 | Grad Max: 0.577335 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.007975 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013379 | Grad Max: 0.046206 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000664 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002969 | Grad Max: 0.006802 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000951 | Grad Max: 0.002486 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001763 | Grad Max: 0.003617 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027690 | Grad Max: 0.027690 [GRADIENT NORM TOTAL] 3.9128 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.251 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54218817 0.4578119 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 275/1341 | B: 301/1555 | C: 258/1790 [LOSS Ex1] A: 0.67316 | B: 0.67483 | C: 0.66976 [LOGITS Ex2 A] Mean Abs: 1.445 | Max: 5.372 [LOSS Ex2] A: 0.23473 | B: 0.39777 | C: 0.36634 ** [JOINT LOSS] ** : 1.005534 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.131594 -> Layer: shared_layers.0.bias | Grad Mean: 0.315473 | Grad Max: 1.632601 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.008041 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000566 | Grad Max: 0.000566 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.164620 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041600 | Grad Max: 0.918911 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.011343 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019775 | Grad Max: 0.068297 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000913 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004368 | Grad Max: 0.009695 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000357 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001394 | Grad Max: 0.003198 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002548 | Grad Max: 0.004919 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040646 | Grad Max: 0.040646 [GRADIENT NORM TOTAL] 6.1378 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.385 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070553 0.4929447] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 351/1697 | B: 307/1741 | C: 245/1803 [LOSS Ex1] A: 0.67340 | B: 0.67429 | C: 0.67192 [LOGITS Ex2 A] Mean Abs: 
1.453 | Max: 5.684 [LOSS Ex2] A: 0.24686 | B: 0.42572 | C: 0.36404 ** [JOINT LOSS] ** : 1.018739 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.051510 -> Layer: shared_layers.0.bias | Grad Mean: 0.116113 | Grad Max: 0.658866 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002008 | Grad Max: 0.008666 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011018 | Grad Max: 0.011018 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000853 | Grad Max: 0.122965 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015817 | Grad Max: 0.694469 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.006143 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007054 | Grad Max: 0.032651 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001554 | Grad Max: 0.003744 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000496 | Grad Max: 0.001517 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000926 | Grad Max: 0.002128 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014768 | Grad Max: 0.014768 [GRADIENT NORM TOTAL] 2.4642 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.359 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037591 0.49624097] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 366/1682 | B: 319/1729 | C: 181/1195 [LOSS Ex1] A: 0.67246 | B: 0.67475 | C: 0.66971 [LOGITS Ex2 A] Mean Abs: 1.476 | Max: 5.255 [LOSS Ex2] A: 0.25467 | B: 0.42795 | C: 0.33720 ** [JOINT LOSS] ** : 1.012245 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006526 | Grad Max: 0.155827 -> Layer: shared_layers.0.bias | Grad Mean: 0.317864 | Grad Max: 1.500377 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.009013 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.010238 | Grad Max: 0.010238 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.210533 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041964 | Grad Max: 1.131443 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.011216 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019230 | Grad Max: 0.066830 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000858 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004280 | Grad Max: 0.009413 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000387 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003415 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002370 | Grad Max: 0.005064 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039193 | Grad Max: 0.039193 [GRADIENT NORM TOTAL] 6.1274 [EPOCH SUMMARY] Train Loss: 1.0113 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9955 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 50/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.369 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031915 0.49680853] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 344/1704 | B: 336/1712 | C: 274/1774 [LOSS Ex1] A: 0.67156 | B: 0.67214 | C: 0.66856 [LOGITS Ex2 A] Mean Abs: 1.496 | Max: 5.494 [LOSS Ex2] A: 0.26582 | B: 0.41595 | C: 0.35416 ** [JOINT LOSS] ** : 1.016061 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008499 | Grad Max: 0.247885 -> Layer: shared_layers.0.bias | Grad Mean: 0.414963 | Grad Max: 1.964271 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.009092 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002839 | Grad Max: 0.002839 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003017 | Grad Max: 0.256202 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.056315 | Grad Max: 1.359728 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.015634 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025793 | Grad Max: 0.089253 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005764 | Grad Max: 0.012593 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000490 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001852 | Grad Max: 0.004381 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003374 | Grad Max: 0.006115 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054384 | Grad Max: 0.054384 [GRADIENT NORM TOTAL] 8.0997 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.313 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061659 0.4938341] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 323/1725 | B: 302/1554 | C: 269/1779 [LOSS Ex1] A: 0.67523 | B: 0.67473 | C: 0.66994 [LOGITS Ex2 A] Mean Abs: 1.445 | Max: 5.276 [LOSS Ex2] A: 0.23793 | B: 0.39695 | C: 0.31754 ** [JOINT LOSS] ** : 0.990775 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003053 | Grad Max: 0.081947 -> Layer: shared_layers.0.bias | Grad Mean: 0.201856 | Grad Max: 0.960309 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001910 | Grad Max: 0.007706 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005930 | Grad Max: 0.005930 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001388 | Grad Max: 0.132629 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026115 | Grad Max: 0.756341 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.007266 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012018 | Grad Max: 0.044325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000530 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002643 | Grad Max: 0.005972 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000862 | Grad Max: 0.002168 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001501 | Grad Max: 0.004006 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025943 | Grad Max: 0.025943 [GRADIENT NORM TOTAL] 4.0163 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.216 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5187216 0.48127842] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 321/1727 | B: 307/1741 | C: 269/1779 [LOSS Ex1] A: 0.67475 | B: 0.67418 | C: 0.66947 [LOGITS Ex2 A] Mean Abs: 1.367 | Max: 5.640 [LOSS Ex2] A: 0.24942 | B: 0.42555 | C: 0.35640 ** [JOINT LOSS] ** : 1.016592 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004380 | Grad Max: 0.096356 -> Layer: shared_layers.0.bias | Grad Mean: 0.230850 | Grad Max: 1.059020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001910 | Grad Max: 0.007797 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001464 | Grad Max: 0.001464 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.210220 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029965 | Grad Max: 1.186436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.009176 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013306 | Grad Max: 0.051251 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000624 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002956 | Grad Max: 0.006914 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000313 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000952 | Grad Max: 0.002719 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001836 | Grad Max: 0.003630 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028358 | Grad Max: 0.028358 [GRADIENT NORM TOTAL] 4.5623 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.346 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5757721 0.42422792] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.024 [MASKS] A(Pass/Fail): 361/1687 | B: 320/1728 | C: 257/1791 [LOSS Ex1] A: 0.67269 | B: 0.67465 | C: 0.67097 [LOGITS Ex2 A] Mean Abs: 1.422 | Max: 5.493 [LOSS Ex2] A: 0.23309 | B: 0.42223 | C: 0.35879 ** [JOINT LOSS] ** : 1.010808 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006571 | Grad Max: 0.140868 -> Layer: shared_layers.0.bias | Grad Mean: 0.345074 | Grad Max: 1.612147 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008955 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014927 | Grad Max: 0.014927 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002362 | Grad Max: 0.243817 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044677 | Grad Max: 1.369806 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000510 | Grad Max: 0.012645 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020707 | Grad Max: 0.073093 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000895 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004603 | Grad Max: 0.009990 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000405 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001471 | Grad Max: 0.003597 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002679 | Grad Max: 0.004847 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043245 | Grad Max: 0.043245 [GRADIENT NORM TOTAL] 6.5637 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.387 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018333 0.49816668] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 334/1714 | B: 337/1711 | C: 247/1801 [LOSS Ex1] A: 0.67561 | B: 0.67204 | C: 0.67011 [LOGITS Ex2 A] Mean Abs: 1.429 | Max: 5.414 [LOSS 
Ex2] A: 0.23696 | B: 0.40309 | C: 0.35957 ** [JOINT LOSS] ** : 1.005790 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004796 | Grad Max: 0.122278 -> Layer: shared_layers.0.bias | Grad Mean: 0.161628 | Grad Max: 0.738087 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.006920 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001955 | Grad Max: 0.001955 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001197 | Grad Max: 0.109482 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021861 | Grad Max: 0.544010 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.006445 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010061 | Grad Max: 0.036301 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000459 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002286 | Grad Max: 0.005206 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000230 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000733 | Grad Max: 0.001933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001438 | Grad Max: 0.003292 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021587 | Grad Max: 0.021587 [GRADIENT NORM TOTAL] 3.0091 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.191 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56983334 0.4301666 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 330/1718 | B: 304/1552 | C: 243/1805 [LOSS Ex1] A: 0.67394 | B: 0.67463 | C: 0.67091 [LOGITS Ex2 A] Mean Abs: 1.471 | Max: 4.934 [LOSS Ex2] A: 0.25913 | B: 0.39655 | C: 0.36861 ** [JOINT LOSS] ** : 1.014590 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004916 | Grad Max: 0.114569 -> Layer: shared_layers.0.bias | Grad Mean: 0.282052 | Grad Max: 1.417381 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001980 | Grad Max: 0.008726 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009197 | Grad Max: 
0.009197 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.169571 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037542 | Grad Max: 0.947580 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000434 | Grad Max: 0.009851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017664 | Grad Max: 0.061555 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000844 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003888 | Grad Max: 0.008719 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000338 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001243 | Grad Max: 0.003037 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002127 | Grad Max: 0.004141 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036037 | Grad Max: 0.036037 [GRADIENT NORM TOTAL] 5.5678 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.254 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5427067 0.4572933] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 281/1335 | B: 307/1741 | C: 282/1766 [LOSS Ex1] A: 0.67296 | B: 0.67408 | C: 0.66731 [LOGITS Ex2 A] Mean Abs: 1.530 | Max: 5.176 [LOSS Ex2] A: 0.24871 | B: 0.43854 | C: 0.33652 ** [JOINT LOSS] ** : 1.012708 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006870 | Grad Max: 0.162918 -> Layer: shared_layers.0.bias | Grad Mean: 0.359780 | Grad Max: 1.718003 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.008061 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002393 | Grad Max: 0.002393 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002474 | Grad Max: 0.215579 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046797 | Grad Max: 1.079177 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000520 | Grad Max: 0.014018 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021142 | Grad Max: 0.079253 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000959 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004680 | Grad Max: 0.010559 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000406 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001494 | Grad Max: 0.003650 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002558 | Grad Max: 0.004614 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042862 | Grad Max: 0.042862 [GRADIENT NORM TOTAL] 6.8881 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.388 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070343 0.4929657] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 355/1693 | B: 322/1726 | C: 272/1776 [LOSS Ex1] A: 0.67319 | B: 0.67455 | C: 0.66937 [LOGITS Ex2 A] Mean Abs: 1.490 | Max: 5.213 [LOSS Ex2] A: 0.24021 | B: 0.42016 | C: 0.35874 ** [JOINT LOSS] ** : 1.012076 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004637 | Grad Max: 0.105715 -> Layer: shared_layers.0.bias | Grad Mean: 0.227019 | Grad Max: 1.062659 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.008151 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005481 | Grad Max: 0.005481 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.127068 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030012 | Grad Max: 0.668789 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.009104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013491 | Grad Max: 0.049564 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000697 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003008 | Grad Max: 0.007798 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002486 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001780 | Grad Max: 0.003940 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028997 | Grad Max: 0.028997 [GRADIENT NORM 
TOTAL] 4.3583 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.362 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038211 0.49617893] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 368/1680 | B: 337/1711 | C: 248/1800 [LOSS Ex1] A: 0.67225 | B: 0.67193 | C: 0.67146 [LOGITS Ex2 A] Mean Abs: 1.437 | Max: 6.041 [LOSS Ex2] A: 0.23847 | B: 0.40604 | C: 0.33444 ** [JOINT LOSS] ** : 0.998199 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003294 | Grad Max: 0.067005 -> Layer: shared_layers.0.bias | Grad Mean: 0.198651 | Grad Max: 0.936816 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.009321 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014423 | Grad Max: 0.014423 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001351 | Grad Max: 0.155874 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025233 | Grad Max: 0.882567 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.006890 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011478 | Grad Max: 0.038858 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000572 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002547 | Grad Max: 0.005877 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000260 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000805 | Grad Max: 0.002125 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001447 | Grad Max: 0.003335 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022929 | Grad Max: 0.022929 [GRADIENT NORM TOTAL] 3.8441 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.372 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50317067 0.4968293 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 348/1700 | B: 304/1552 | C: 260/1788 [LOSS Ex1] A: 
0.67134 | B: 0.67454 | C: 0.66928 [LOGITS Ex2 A] Mean Abs: 1.425 | Max: 6.182 [LOSS Ex2] A: 0.25569 | B: 0.40224 | C: 0.32377 ** [JOINT LOSS] ** : 0.998955 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004139 | Grad Max: 0.119781 -> Layer: shared_layers.0.bias | Grad Mean: 0.343926 | Grad Max: 1.588207 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.009617 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011643 | Grad Max: 0.011643 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.248781 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042160 | Grad Max: 1.395509 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.014206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019403 | Grad Max: 0.084519 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.001011 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004254 | Grad Max: 0.010517 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000357 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001338 | Grad Max: 0.003252 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002223 | Grad Max: 0.004204 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036953 | Grad Max: 0.036953 [GRADIENT NORM TOTAL] 6.7585 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.315 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061435 0.49385652] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.024 [MASKS] A(Pass/Fail): 327/1721 | B: 307/1741 | C: 259/1789 [LOSS Ex1] A: 0.67505 | B: 0.67398 | C: 0.67001 [LOGITS Ex2 A] Mean Abs: 1.409 | Max: 5.944 [LOSS Ex2] A: 0.23175 | B: 0.43513 | C: 0.34484 ** [JOINT LOSS] ** : 1.010254 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002467 | Grad Max: 0.062172 -> Layer: shared_layers.0.bias | Grad Mean: 0.180236 | Grad Max: 0.829815 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad 
Max: 0.007788 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008998 | Grad Max: 0.008998 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001200 | Grad Max: 0.163586 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022437 | Grad Max: 0.929825 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007071 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010023 | Grad Max: 0.042144 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000504 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002198 | Grad Max: 0.005241 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000691 | Grad Max: 0.001857 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001211 | Grad Max: 0.002655 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019130 | Grad Max: 0.019130 [GRADIENT NORM TOTAL] 3.6128 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.053 | Max: 0.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51891583 0.4810841 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.023 [MASKS] A(Pass/Fail): 327/1721 | B: 325/1723 | C: 240/1808 [LOSS Ex1] A: 0.67458 | B: 0.67445 | C: 0.67134 [LOGITS Ex2 A] Mean Abs: 1.428 | Max: 5.975 [LOSS Ex2] A: 0.25383 | B: 0.42583 | C: 0.37255 ** [JOINT LOSS] ** : 1.024194 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004384 | Grad Max: 0.116961 -> Layer: shared_layers.0.bias | Grad Mean: 0.183233 | Grad Max: 0.814503 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001847 | Grad Max: 0.007299 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002385 | Grad Max: 0.002385 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001278 | Grad Max: 0.101625 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023693 | Grad Max: 0.515517 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000274 | Grad Max: 0.006649 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011095 | Grad Max: 0.039879 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000604 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002481 | Grad Max: 0.006214 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000242 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000780 | Grad Max: 0.002060 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001254 | Grad Max: 0.002730 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021212 | Grad Max: 0.021212 [GRADIENT NORM TOTAL] 3.4462 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.348 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57665694 0.4233431 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.024 [MASKS] A(Pass/Fail): 365/1683 | B: 337/1711 | C: 249/1799 [LOSS Ex1] A: 0.67249 | B: 0.67183 | C: 0.66977 [LOGITS Ex2 A] Mean Abs: 1.473 | Max: 5.409 [LOSS Ex2] A: 0.23475 | B: 0.41296 | C: 0.35357 ** [JOINT LOSS] ** : 1.005122 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005777 | Grad Max: 0.150724 -> Layer: shared_layers.0.bias | Grad Mean: 0.289286 | Grad Max: 1.326064 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.008883 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008923 | Grad Max: 0.008923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.201273 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037787 | Grad Max: 1.021285 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.009132 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017238 | Grad Max: 0.057017 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000811 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003809 | Grad Max: 0.009134 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001222 | Grad Max: 0.002982 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002148 | Grad Max: 0.004764 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.035822 | Grad Max: 0.035822 [GRADIENT NORM TOTAL] 5.5407 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.390 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018011 0.49819887] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 337/1711 | B: 305/1551 | C: 172/1204 [LOSS Ex1] A: 0.67543 | B: 0.67444 | C: 0.66885 [LOGITS Ex2 A] Mean Abs: 1.455 | Max: 5.055 [LOSS Ex2] A: 0.22164 | B: 0.40126 | C: 0.34001 ** [JOINT LOSS] ** : 0.993878 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.049307 -> Layer: shared_layers.0.bias | Grad Mean: 0.064425 | Grad Max: 0.292392 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.007677 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006317 | Grad Max: 0.006317 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000508 | Grad Max: 0.065038 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008576 | Grad Max: 0.360260 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.004464 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002888 | Grad Max: 0.022902 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000577 | Grad Max: 0.002176 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000114 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000189 | Grad Max: 0.000800 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000603 | Grad Max: 0.001706 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005836 | Grad Max: 0.005836 [GRADIENT NORM TOTAL] 1.3869 [EPOCH SUMMARY] Train Loss: 1.0079 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9889 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 51/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.193 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5706051 0.42939496] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 334/1714 | B: 307/1741 | C: 293/1755 [LOSS Ex1] A: 0.67374 | B: 0.67387 | C: 0.66797 [LOGITS Ex2 A] Mean Abs: 1.404 | Max: 5.588 [LOSS Ex2] A: 0.24576 | B: 0.43767 | C: 0.33676 ** [JOINT LOSS] ** : 1.011920 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005406 | Grad Max: 0.113882 -> Layer: shared_layers.0.bias | Grad Mean: 0.349164 | Grad Max: 1.593858 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.008270 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004772 | Grad Max: 0.004772 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002382 | Grad Max: 0.256129 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045010 | Grad Max: 1.447117 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000504 | Grad Max: 0.012334 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020625 | Grad Max: 0.073955 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000926 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004592 | Grad Max: 0.010659 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000379 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001461 | Grad Max: 0.003545 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002473 | Grad Max: 0.004379 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040549 | Grad Max: 0.040549 [GRADIENT NORM TOTAL] 6.7722 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.256 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54322463 0.45677543] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.024 [MASKS] A(Pass/Fail): 285/1331 | B: 327/1721 | C: 245/1803 [LOSS Ex1] A: 0.67276 | B: 0.67434 | C: 0.67094 [LOGITS Ex2 A] Mean Abs: 1.457 | Max: 5.700 [LOSS Ex2] A: 0.24072 | B: 0.43893 | C: 0.33051 ** [JOINT LOSS] ** : 1.009400 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007222 | Grad Max: 0.170503 -> Layer: shared_layers.0.bias | Grad Mean: 0.418495 | Grad Max: 1.896594 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.008402 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004747 | Grad Max: 0.004747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002884 | Grad Max: 0.293898 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054185 | Grad Max: 1.663638 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.015625 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025368 | Grad Max: 0.090619 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001058 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005634 | Grad Max: 0.011815 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001808 | Grad Max: 0.004284 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003256 | Grad Max: 0.005914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053134 | Grad Max: 0.053134 [GRADIENT NORM TOTAL] 8.0885 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.391 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50701725 0.4929827 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 359/1689 | B: 337/1711 | C: 244/1804 [LOSS Ex1] A: 0.67299 | B: 0.67171 | C: 0.66885 [LOGITS Ex2 A] Mean Abs: 1.459 | Max: 5.254 [LOSS Ex2] A: 0.24253 | B: 0.40792 | C: 0.36284 ** [JOINT LOSS] ** : 1.008945 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.076758 
-> Layer: shared_layers.0.bias | Grad Mean: 0.182944 | Grad Max: 0.777063 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.008532 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005743 | Grad Max: 0.005743 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001289 | Grad Max: 0.209590 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024018 | Grad Max: 1.184287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.006856 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011197 | Grad Max: 0.042113 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000536 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002485 | Grad Max: 0.005520 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000795 | Grad Max: 0.002069 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001494 | Grad Max: 0.003022 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023738 | Grad Max: 0.023738 [GRADIENT NORM TOTAL] 3.7603 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.365 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038717 0.49612832] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 372/1676 | B: 305/1551 | C: 259/1789 [LOSS Ex1] A: 0.67203 | B: 0.67432 | C: 0.66894 [LOGITS Ex2 A] Mean Abs: 1.495 | Max: 5.345 [LOSS Ex2] A: 0.24386 | B: 0.39652 | C: 0.36311 ** [JOINT LOSS] ** : 1.006262 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005893 | Grad Max: 0.143330 -> Layer: shared_layers.0.bias | Grad Mean: 0.282829 | Grad Max: 1.372583 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.009033 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010997 | Grad Max: 0.010997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.196181 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037764 | Grad Max: 1.024901 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.011324 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016991 | Grad Max: 0.062707 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000842 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003770 | Grad Max: 0.008636 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000393 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001206 | Grad Max: 0.003129 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002136 | Grad Max: 0.004650 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035279 | Grad Max: 0.035279 [GRADIENT NORM TOTAL] 5.4894 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.375 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031238 0.49687618] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 351/1697 | B: 308/1740 | C: 255/1793 [LOSS Ex1] A: 0.67113 | B: 0.67376 | C: 0.67172 [LOGITS Ex2 A] Mean Abs: 1.484 | Max: 6.915 [LOSS Ex2] A: 0.27138 | B: 0.43714 | C: 0.35069 ** [JOINT LOSS] ** : 1.025271 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008802 | Grad Max: 0.245253 -> Layer: shared_layers.0.bias | Grad Mean: 0.426945 | Grad Max: 1.987905 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.009595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014022 | Grad Max: 0.014022 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003072 | Grad Max: 0.240826 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057419 | Grad Max: 1.316973 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.015104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025990 | Grad Max: 0.090110 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001168 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005808 | Grad Max: 0.013120 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000495 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001854 | Grad Max: 0.004561 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003276 | Grad Max: 0.006090 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054083 | Grad Max: 0.054083 [GRADIENT NORM TOTAL] 8.2265 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.318 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50609255 0.49390745] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.521 | Std: 0.024 [MASKS] A(Pass/Fail): 329/1719 | B: 327/1721 | C: 240/1808 [LOSS Ex1] A: 0.67487 | B: 0.67423 | C: 0.67163 [LOGITS Ex2 A] Mean Abs: 1.464 | Max: 5.399 [LOSS Ex2] A: 0.24963 | B: 0.42029 | C: 0.35817 ** [JOINT LOSS] ** : 1.016273 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005459 | Grad Max: 0.123672 -> Layer: shared_layers.0.bias | Grad Mean: 0.329206 | Grad Max: 1.507023 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001875 | Grad Max: 0.007266 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006443 | Grad Max: 0.006443 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.189990 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041676 | Grad Max: 1.034025 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.012822 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019195 | Grad Max: 0.077233 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000943 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004265 | Grad Max: 0.010196 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000365 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001359 | Grad Max: 0.003242 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002181 | Grad Max: 0.004605 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037738 | Grad Max: 0.037738 [GRADIENT NORM TOTAL] 6.3299 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.053 | Max: 0.219 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51906127 0.48093876] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.521 | Std: 0.024 [MASKS] A(Pass/Fail): 333/1715 | B: 338/1710 | C: 255/1793 [LOSS Ex1] A: 0.67440 | B: 0.67159 | C: 0.67069 [LOGITS Ex2 A] Mean Abs: 1.391 | Max: 5.495 [LOSS Ex2] A: 0.23904 | B: 0.40527 | C: 0.34280 ** [JOINT LOSS] ** : 1.001261 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002908 | Grad Max: 0.062837 -> Layer: shared_layers.0.bias | Grad Mean: 0.129995 | Grad Max: 0.601151 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001928 | Grad Max: 0.007641 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001021 | Grad Max: 0.001021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000897 | Grad Max: 0.063186 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016896 | Grad Max: 0.347359 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.005721 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008270 | Grad Max: 0.031166 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000442 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001822 | Grad Max: 0.004416 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000566 | Grad Max: 0.001676 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000968 | Grad Max: 0.002576 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015068 | Grad Max: 0.015068 [GRADIENT NORM TOTAL] 2.4812 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.351 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5775888 0.4224112] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 369/1679 | B: 307/1549 | C: 273/1775 [LOSS Ex1] A: 0.67228 | B: 0.67421 | C: 0.66785 [LOGITS Ex2 A] Mean Abs: 1.447 | Max: 6.206 [LOSS Ex2] A: 0.21780 | B: 0.40282 | C: 0.33645 ** [JOINT LOSS] ** : 0.990476 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002927 | Grad Max: 0.068813 -> Layer: shared_layers.0.bias | Grad Mean: 0.233869 | Grad Max: 1.039296 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.008729 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007197 | Grad Max: 0.007197 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001507 | Grad Max: 0.113691 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028524 | Grad Max: 0.651593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.008516 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013674 | Grad Max: 0.054754 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000711 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003007 | Grad Max: 0.007338 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000285 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000936 | Grad Max: 0.002607 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001543 | Grad Max: 0.003600 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025474 | Grad Max: 0.025474 [GRADIENT NORM TOTAL] 4.4319 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.393 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017548 0.49824515] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 343/1705 | B: 308/1740 | C: 255/1793 [LOSS Ex1] A: 0.67525 | B: 0.67365 | C: 0.66883 [LOGITS Ex2 A] Mean Abs: 1.453 | Max: 5.435 [LOSS Ex2] A: 0.24170 | B: 0.42138 | C: 0.34632 ** [JOINT LOSS] ** : 1.009042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.048691 -> Layer: shared_layers.0.bias | Grad Mean: 0.028358 | Grad Max: 0.201440 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001935 | Grad Max: 0.007708 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005922 | Grad Max: 0.005922 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000301 | Grad Max: 
0.039378 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004421 | Grad Max: 0.209209 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002123 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001208 | Grad Max: 0.007498 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000204 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000259 | Grad Max: 0.001736 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000451 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001192 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002419 | Grad Max: 0.002419 [GRADIENT NORM TOTAL] 0.6784 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.195 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57136565 0.42863432] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 342/1706 | B: 329/1719 | C: 250/1798 [LOSS Ex1] A: 0.67354 | B: 0.67411 | C: 0.66940 [LOGITS Ex2 A] Mean Abs: 1.481 | Max: 5.206 [LOSS Ex2] A: 0.25827 | B: 0.42602 | C: 0.33902 ** [JOINT LOSS] ** : 1.013459 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004544 | Grad Max: 0.120813 -> Layer: shared_layers.0.bias | Grad Mean: 0.233195 | Grad Max: 1.037590 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.007819 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001018 | Grad Max: 0.001018 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001603 | Grad Max: 0.140401 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029895 | Grad Max: 0.765885 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.009842 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013483 | Grad Max: 0.055523 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000646 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002986 | Grad Max: 0.007161 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000954 | Grad Max: 0.002293 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001663 | Grad Max: 0.003583 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027286 | Grad Max: 0.027286 [GRADIENT NORM TOTAL] 4.5088 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.259 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54371023 0.4562898 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 288/1328 | B: 341/1707 | C: 263/1785 [LOSS Ex1] A: 0.67255 | B: 0.67146 | C: 0.66952 [LOGITS Ex2 A] Mean Abs: 1.520 | Max: 6.442 [LOSS Ex2] A: 0.23263 | B: 0.40570 | C: 0.34564 ** [JOINT LOSS] ** : 0.999165 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004897 | Grad Max: 0.110447 -> Layer: shared_layers.0.bias | Grad Mean: 0.221893 | Grad Max: 1.042382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.008159 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002253 | Grad Max: 0.002253 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001569 | Grad Max: 0.121090 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029158 | Grad Max: 0.646262 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000326 | Grad Max: 0.008435 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013258 | Grad Max: 0.044259 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000696 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002959 | Grad Max: 0.007226 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000279 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000937 | Grad Max: 0.002434 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001479 | Grad Max: 0.003434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026422 | Grad Max: 0.026422 [GRADIENT NORM TOTAL] 4.2203 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.395 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50698644 0.49301353] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 361/1687 | B: 308/1548 | C: 262/1786 [LOSS Ex1] A: 0.67277 | B: 0.67409 | C: 0.66831 [LOGITS Ex2 A] Mean Abs: 1.458 | Max: 5.416 [LOSS Ex2] A: 0.24037 | B: 0.39634 | C: 0.36041 ** [JOINT LOSS] ** : 1.004096 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001731 | Grad Max: 0.044744 -> Layer: shared_layers.0.bias | Grad Mean: 0.088693 | Grad Max: 0.447580 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.009185 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012273 | Grad Max: 0.012273 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000689 | Grad Max: 0.059233 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012271 | Grad Max: 0.328681 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.004253 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005018 | Grad Max: 0.024474 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001089 | Grad Max: 0.003222 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000147 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000341 | Grad Max: 0.001102 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000563 | Grad Max: 0.001917 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009625 | Grad Max: 0.009625 [GRADIENT NORM TOTAL] 1.8493 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.368 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50390667 0.4960933 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 380/1668 | B: 308/1740 | C: 266/1782 [LOSS Ex1] A: 0.67178 | B: 0.67351 | C: 0.66861 [LOGITS Ex2 A] Mean 
Abs: 1.451 | Max: 5.265 [LOSS Ex2] A: 0.24000 | B: 0.43203 | C: 0.34466 ** [JOINT LOSS] ** : 1.010198 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002331 | Grad Max: 0.059156 -> Layer: shared_layers.0.bias | Grad Mean: 0.155309 | Grad Max: 0.772081 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.009109 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011073 | Grad Max: 0.011073 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001108 | Grad Max: 0.131922 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020317 | Grad Max: 0.720941 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000227 | Grad Max: 0.005944 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009321 | Grad Max: 0.033575 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000420 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002060 | Grad Max: 0.004946 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000655 | Grad Max: 0.001760 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001209 | Grad Max: 0.002438 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018914 | Grad Max: 0.018914 [GRADIENT NORM TOTAL] 3.1740 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.378 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50308394 0.49691606] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 355/1693 | B: 333/1715 | C: 182/1194 [LOSS Ex1] A: 0.67087 | B: 0.67398 | C: 0.66881 [LOGITS Ex2 A] Mean Abs: 1.473 | Max: 5.976 [LOSS Ex2] A: 0.24768 | B: 0.41773 | C: 0.34392 ** [JOINT LOSS] ** : 1.007663 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002763 | Grad Max: 0.088366 -> Layer: shared_layers.0.bias | Grad Mean: 0.086101 | Grad Max: 0.312898 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.007928 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.000863 | Grad Max: 0.000863 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000661 | Grad Max: 0.052777 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011397 | Grad Max: 0.299911 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.004105 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004158 | Grad Max: 0.020472 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000338 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000968 | Grad Max: 0.003148 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000300 | Grad Max: 0.000954 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001935 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007635 | Grad Max: 0.007635 [GRADIENT NORM TOTAL] 1.6538 [EPOCH SUMMARY] Train Loss: 1.0081 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9761 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9849 -> New: 0.9761) ############################## EPOCH 52/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.320 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50605214 0.49394783] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 338/1710 | B: 344/1704 | C: 256/1792 [LOSS Ex1] A: 0.67464 | B: 0.67132 | C: 0.66985 [LOGITS Ex2 A] Mean Abs: 1.444 | Max: 5.738 [LOSS Ex2] A: 0.24374 | B: 0.39594 | C: 0.34865 ** [JOINT LOSS] ** : 1.001379 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.053680 -> Layer: shared_layers.0.bias | Grad Mean: 0.048953 | Grad Max: 0.209079 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001914 | Grad Max: 0.007299 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002002 | Grad Max: 0.002002 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000425 | 
Grad Max: 0.048402 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006931 | Grad Max: 0.277079 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003604 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001514 | Grad Max: 0.016461 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000173 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000262 | Grad Max: 0.001974 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000081 | Grad Max: 0.000455 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001265 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000725 | Grad Max: 0.000725 [GRADIENT NORM TOTAL] 1.0838 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.054 | Max: 0.221 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51926935 0.48073068] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 341/1707 | B: 310/1546 | C: 278/1770 [LOSS Ex1] A: 0.67414 | B: 0.67394 | C: 0.66780 [LOGITS Ex2 A] Mean Abs: 1.416 | Max: 5.740 [LOSS Ex2] A: 0.23852 | B: 0.38958 | C: 0.34240 ** [JOINT LOSS] ** : 0.995461 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002903 | Grad Max: 0.078113 -> Layer: shared_layers.0.bias | Grad Mean: 0.129526 | Grad Max: 0.535633 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001974 | Grad Max: 0.008751 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006909 | Grad Max: 0.006909 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000918 | Grad Max: 0.153267 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016560 | Grad Max: 0.863587 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.004509 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006695 | Grad Max: 0.026377 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000431 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001519 | Grad Max: 
0.004123 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000470 | Grad Max: 0.001433 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000755 | Grad Max: 0.002363 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012301 | Grad Max: 0.012301 [GRADIENT NORM TOTAL] 2.6861 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.355 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5788781 0.4211219] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 371/1677 | B: 310/1738 | C: 252/1796 [LOSS Ex1] A: 0.67198 | B: 0.67335 | C: 0.66757 [LOGITS Ex2 A] Mean Abs: 1.474 | Max: 5.149 [LOSS Ex2] A: 0.22332 | B: 0.42433 | C: 0.33215 ** [JOINT LOSS] ** : 0.997569 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001852 | Grad Max: 0.042809 -> Layer: shared_layers.0.bias | Grad Mean: 0.052470 | Grad Max: 0.205765 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.008788 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009730 | Grad Max: 0.009730 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000409 | Grad Max: 0.136052 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006459 | Grad Max: 0.774613 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002440 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001159 | Grad Max: 0.013494 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000231 | Grad Max: 0.001310 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000081 | Grad Max: 0.000524 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000493 | Grad Max: 0.001187 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000195 | Grad Max: 0.000195 [GRADIENT NORM TOTAL] 1.5768 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.398 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017345 0.4982655] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 350/1698 | B: 335/1713 | C: 272/1776 [LOSS Ex1] A: 0.67497 | B: 0.67381 | C: 0.66852 [LOGITS Ex2 A] Mean Abs: 1.486 | Max: 5.325 [LOSS Ex2] A: 0.23687 | B: 0.41176 | C: 0.31996 ** [JOINT LOSS] ** : 0.995297 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002615 | Grad Max: 0.058875 -> Layer: shared_layers.0.bias | Grad Mean: 0.139443 | Grad Max: 0.684730 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001929 | Grad Max: 0.007885 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007187 | Grad Max: 0.007187 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001008 | Grad Max: 0.090277 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018438 | Grad Max: 0.498375 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000210 | Grad Max: 0.007104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008548 | Grad Max: 0.040166 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000393 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001819 | Grad Max: 0.004317 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000584 | Grad Max: 0.001497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000941 | Grad Max: 0.002865 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017094 | Grad Max: 0.017094 [GRADIENT NORM TOTAL] 2.7949 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.198 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5725275 0.42747247] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 351/1697 | B: 347/1701 | C: 285/1763 [LOSS Ex1] A: 0.67323 | B: 0.67113 | C: 0.66819 [LOGITS Ex2 A] Mean Abs: 
1.476 | Max: 5.314 [LOSS Ex2] A: 0.24148 | B: 0.39948 | C: 0.33713 ** [JOINT LOSS] ** : 0.996880 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.032996 -> Layer: shared_layers.0.bias | Grad Mean: 0.044438 | Grad Max: 0.211552 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.008527 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010330 | Grad Max: 0.010330 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000427 | Grad Max: 0.055054 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007386 | Grad Max: 0.301503 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003047 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002802 | Grad Max: 0.015893 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000251 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000600 | Grad Max: 0.002076 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000200 | Grad Max: 0.000632 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001890 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006724 | Grad Max: 0.006724 [GRADIENT NORM TOTAL] 1.0432 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.264 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5445401 0.45545983] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 295/1321 | B: 310/1546 | C: 277/1771 [LOSS Ex1] A: 0.67220 | B: 0.67376 | C: 0.66725 [LOGITS Ex2 A] Mean Abs: 1.477 | Max: 6.139 [LOSS Ex2] A: 0.23804 | B: 0.40799 | C: 0.34584 ** [JOINT LOSS] ** : 1.001694 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006550 | Grad Max: 0.168087 -> Layer: shared_layers.0.bias | Grad Mean: 0.436391 | Grad Max: 2.054043 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.007778 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001986 | Grad Max: 0.001986 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002837 | Grad Max: 0.254246 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053996 | Grad Max: 1.388732 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.014795 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025994 | Grad Max: 0.090741 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001136 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005679 | Grad Max: 0.012365 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000487 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001787 | Grad Max: 0.004292 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003090 | Grad Max: 0.005582 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050916 | Grad Max: 0.050916 [GRADIENT NORM TOTAL] 8.1977 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.400 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069848 0.49301517] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 375/1673 | B: 311/1737 | C: 231/1817 [LOSS Ex1] A: 0.67242 | B: 0.67316 | C: 0.67270 [LOGITS Ex2 A] Mean Abs: 1.461 | Max: 6.300 [LOSS Ex2] A: 0.24286 | B: 0.45576 | C: 0.36073 ** [JOINT LOSS] ** : 1.025877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005581 | Grad Max: 0.161819 -> Layer: shared_layers.0.bias | Grad Mean: 0.442567 | Grad Max: 2.098195 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008939 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016120 | Grad Max: 0.016120 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002884 | Grad Max: 0.221121 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054799 | Grad Max: 1.219781 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.016208 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025893 | Grad Max: 0.095369 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | 
Grad Max: 0.001151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005664 | Grad Max: 0.012210 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000462 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001787 | Grad Max: 0.004038 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003050 | Grad Max: 0.006117 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051169 | Grad Max: 0.051169 [GRADIENT NORM TOTAL] 8.4464 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.372 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039592 0.49604082] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 391/1657 | B: 336/1712 | C: 244/1804 [LOSS Ex1] A: 0.67141 | B: 0.67362 | C: 0.67088 [LOGITS Ex2 A] Mean Abs: 1.474 | Max: 5.549 [LOSS Ex2] A: 0.22616 | B: 0.42304 | C: 0.34715 ** [JOINT LOSS] ** : 1.004089 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003123 | Grad Max: 0.078578 -> Layer: shared_layers.0.bias | Grad Mean: 0.221815 | Grad Max: 1.082166 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.009187 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014580 | Grad Max: 0.014580 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001473 | Grad Max: 0.104000 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027388 | Grad Max: 0.560247 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.008338 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012976 | Grad Max: 0.051796 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000604 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002778 | Grad Max: 0.006338 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002266 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001567 | Grad Max: 0.003318 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025034 | Grad Max: 
0.025034 [GRADIENT NORM TOTAL] 4.2369 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.383 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5030145 0.49698552] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 365/1683 | B: 350/1698 | C: 260/1788 [LOSS Ex1] A: 0.67049 | B: 0.67093 | C: 0.66801 [LOGITS Ex2 A] Mean Abs: 1.512 | Max: 5.809 [LOSS Ex2] A: 0.26347 | B: 0.40903 | C: 0.33341 ** [JOINT LOSS] ** : 1.005115 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007399 | Grad Max: 0.265852 -> Layer: shared_layers.0.bias | Grad Mean: 0.299978 | Grad Max: 1.291453 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.008358 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000635 | Grad Max: 0.000635 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.208734 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041220 | Grad Max: 1.102593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.009907 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018052 | Grad Max: 0.059380 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000840 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004069 | Grad Max: 0.009078 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000361 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001288 | Grad Max: 0.003220 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002295 | Grad Max: 0.004685 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037248 | Grad Max: 0.037248 [GRADIENT NORM TOTAL] 5.8836 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.324 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059813 0.49401867] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 351/1697 | B: 312/1544 | C: 
277/1771 [LOSS Ex1] A: 0.67432 | B: 0.67358 | C: 0.66767 [LOGITS Ex2 A] Mean Abs: 1.512 | Max: 5.429 [LOSS Ex2] A: 0.24243 | B: 0.40176 | C: 0.35352 ** [JOINT LOSS] ** : 1.004431 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009348 | Grad Max: 0.227232 -> Layer: shared_layers.0.bias | Grad Mean: 0.477093 | Grad Max: 2.134872 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001930 | Grad Max: 0.007773 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005854 | Grad Max: 0.005854 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003293 | Grad Max: 0.302185 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061790 | Grad Max: 1.648027 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000695 | Grad Max: 0.016417 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028486 | Grad Max: 0.102069 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006307 | Grad Max: 0.013910 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000486 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001983 | Grad Max: 0.004648 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003460 | Grad Max: 0.006290 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057070 | Grad Max: 0.057070 [GRADIENT NORM TOTAL] 9.1035 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.223 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5196335 0.4803666] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.024 [MASKS] A(Pass/Fail): 351/1697 | B: 316/1732 | C: 243/1805 [LOSS Ex1] A: 0.67383 | B: 0.67298 | C: 0.67018 [LOGITS Ex2 A] Mean Abs: 1.473 | Max: 5.773 [LOSS Ex2] A: 0.24960 | B: 0.42749 | C: 0.34925 ** [JOINT LOSS] ** : 1.014440 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006188 | Grad Max: 0.185117 -> Layer: shared_layers.0.bias | Grad Mean: 0.303763 | Grad Max: 1.350207 -> Layer: exit1_layers.0.weight | Grad 
Mean: 0.001860 | Grad Max: 0.007034 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003431 | Grad Max: 0.003431 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.196612 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039135 | Grad Max: 1.044422 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.010277 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018279 | Grad Max: 0.062921 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000854 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004061 | Grad Max: 0.009267 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000342 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003066 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002126 | Grad Max: 0.004297 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035783 | Grad Max: 0.035783 [GRADIENT NORM TOTAL] 5.8035 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.360 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58064103 0.41935894] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 383/1665 | B: 336/1712 | C: 244/1804 [LOSS Ex1] A: 0.67162 | B: 0.67346 | C: 0.67010 [LOGITS Ex2 A] Mean Abs: 1.474 | Max: 5.650 [LOSS Ex2] A: 0.22150 | B: 0.41459 | C: 0.34953 ** [JOINT LOSS] ** : 1.000268 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.048808 -> Layer: shared_layers.0.bias | Grad Mean: 0.102569 | Grad Max: 0.512850 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.008469 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009757 | Grad Max: 0.009757 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000783 | Grad Max: 0.071013 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014574 | Grad Max: 0.399306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.004992 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007020 
| Grad Max: 0.030098 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000381 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001533 | Grad Max: 0.003765 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000472 | Grad Max: 0.001434 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000916 | Grad Max: 0.002550 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014027 | Grad Max: 0.014027 [GRADIENT NORM TOTAL] 2.0570 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.402 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501602 0.49839798] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 357/1691 | B: 351/1697 | C: 272/1776 [LOSS Ex1] A: 0.67467 | B: 0.67076 | C: 0.66714 [LOGITS Ex2 A] Mean Abs: 1.485 | Max: 5.419 [LOSS Ex2] A: 0.22025 | B: 0.41202 | C: 0.33983 ** [JOINT LOSS] ** : 0.994884 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005426 | Grad Max: 0.149818 -> Layer: shared_layers.0.bias | Grad Mean: 0.193602 | Grad Max: 1.052187 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.007388 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001038 | Grad Max: 0.001038 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001507 | Grad Max: 0.160670 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027671 | Grad Max: 0.897604 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.007443 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012888 | Grad Max: 0.043275 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000662 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002892 | Grad Max: 0.007354 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000900 | Grad Max: 0.002252 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001665 | Grad Max: 
0.003569 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025479 | Grad Max: 0.025479 [GRADIENT NORM TOTAL] 3.9206 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57389367 0.42610636] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 366/1682 | B: 314/1542 | C: 173/1203 [LOSS Ex1] A: 0.67291 | B: 0.67343 | C: 0.66851 [LOGITS Ex2 A] Mean Abs: 1.502 | Max: 5.814 [LOSS Ex2] A: 0.23955 | B: 0.38610 | C: 0.34194 ** [JOINT LOSS] ** : 0.994144 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001828 | Grad Max: 0.031693 -> Layer: shared_layers.0.bias | Grad Mean: 0.070200 | Grad Max: 0.250505 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.007925 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002019 | Grad Max: 0.002019 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000515 | Grad Max: 0.111171 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009071 | Grad Max: 0.600321 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003877 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003047 | Grad Max: 0.018669 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000248 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000666 | Grad Max: 0.002369 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000105 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000214 | Grad Max: 0.000790 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000475 | Grad Max: 0.001627 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005841 | Grad Max: 0.005841 [GRADIENT NORM TOTAL] 1.6256 [EPOCH SUMMARY] Train Loss: 1.0023 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9734 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9761 -> New: 0.9734) ############################## EPOCH 53/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.268 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5454524 0.45454758] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 305/1311 | B: 320/1728 | C: 247/1801 [LOSS Ex1] A: 0.67189 | B: 0.67282 | C: 0.67102 [LOGITS Ex2 A] Mean Abs: 1.533 | Max: 6.024 [LOSS Ex2] A: 0.23107 | B: 0.42188 | C: 0.34147 ** [JOINT LOSS] ** : 1.003383 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.038736 -> Layer: shared_layers.0.bias | Grad Mean: 0.108362 | Grad Max: 0.486169 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.007978 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000791 | Grad Max: 0.000791 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000728 | Grad Max: 0.080964 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013374 | Grad Max: 0.459094 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.005074 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006072 | Grad Max: 0.026319 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000335 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001299 | Grad Max: 0.003304 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001176 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001890 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009524 | Grad Max: 0.009524 [GRADIENT NORM TOTAL] 2.1127 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.404 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50690526 0.49309477] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 392/1656 | B: 340/1708 | C: 245/1803 [LOSS Ex1] A: 0.67211 | B: 0.67330 | C: 0.67070 [LOGITS Ex2 A] Mean Abs: 1.515 | Max: 7.604 [LOSS Ex2] A: 0.23778 | B: 0.41270 | C: 0.35868 ** [JOINT LOSS] ** : 1.008424 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004628 | Grad Max: 0.135551 -> Layer: shared_layers.0.bias | Grad Mean: 0.165700 | Grad Max: 0.815410 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.008080 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005929 | Grad Max: 0.005929 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001202 | Grad Max: 0.142191 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022426 | Grad Max: 0.780649 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000247 | Grad Max: 0.006050 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010189 | Grad Max: 0.035803 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000556 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002272 | Grad Max: 0.005989 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000723 | Grad Max: 0.001952 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001510 | Grad Max: 0.003162 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022649 | Grad Max: 0.022649 [GRADIENT NORM TOTAL] 3.2172 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.376 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50412 0.49588] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 402/1646 | B: 352/1696 | C: 266/1782 [LOSS Ex1] A: 0.67108 | B: 0.67058 | C: 0.66618 [LOGITS Ex2 A] Mean Abs: 1.518 | Max: 6.054 [LOSS Ex2] A: 0.22322 | B: 0.40400 | C: 0.34783 ** [JOINT LOSS] ** : 0.994294 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.052931 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.064573 | Grad Max: 0.265717 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.009013 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007601 | Grad Max: 0.007601 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000475 | Grad Max: 0.063473 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008293 | Grad Max: 0.318792 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.003982 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003478 | Grad Max: 0.016676 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000800 | Grad Max: 0.002522 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.000865 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000516 | Grad Max: 0.001596 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006831 | Grad Max: 0.006831 [GRADIENT NORM TOTAL] 1.2373 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.387 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029237 0.4970763] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 376/1672 | B: 315/1541 | C: 256/1792 [LOSS Ex1] A: 0.67015 | B: 0.67325 | C: 0.66850 [LOGITS Ex2 A] Mean Abs: 1.540 | Max: 6.561 [LOSS Ex2] A: 0.24847 | B: 0.39814 | C: 0.34301 ** [JOINT LOSS] ** : 1.000507 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006319 | Grad Max: 0.201275 -> Layer: shared_layers.0.bias | Grad Mean: 0.282365 | Grad Max: 1.316593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.009073 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007816 | Grad Max: 0.007816 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.183352 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038773 | Grad Max: 0.964984 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000443 
| Grad Max: 0.011170 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018239 | Grad Max: 0.065521 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000783 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004064 | Grad Max: 0.009383 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001281 | Grad Max: 0.003054 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002271 | Grad Max: 0.004702 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037008 | Grad Max: 0.037008 [GRADIENT NORM TOTAL] 5.4662 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.328 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50592303 0.49407697] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 364/1684 | B: 326/1722 | C: 266/1782 [LOSS Ex1] A: 0.67404 | B: 0.67264 | C: 0.66808 [LOGITS Ex2 A] Mean Abs: 1.524 | Max: 4.838 [LOSS Ex2] A: 0.23728 | B: 0.43039 | C: 0.34034 ** [JOINT LOSS] ** : 1.007587 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006645 | Grad Max: 0.177450 -> Layer: shared_layers.0.bias | Grad Mean: 0.286637 | Grad Max: 1.421772 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001899 | Grad Max: 0.007155 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000616 | Grad Max: 0.000616 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.139695 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038474 | Grad Max: 0.786546 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000435 | Grad Max: 0.010357 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017928 | Grad Max: 0.064260 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000766 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003991 | Grad Max: 0.008894 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000356 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001240 | Grad Max: 
0.003073 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002034 | Grad Max: 0.004078 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034346 | Grad Max: 0.034346 [GRADIENT NORM TOTAL] 5.4399 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.055 | Max: 0.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5199569 0.48004308] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.522 | Std: 0.025 [MASKS] A(Pass/Fail): 360/1688 | B: 343/1705 | C: 283/1765 [LOSS Ex1] A: 0.67355 | B: 0.67312 | C: 0.66699 [LOGITS Ex2 A] Mean Abs: 1.460 | Max: 5.216 [LOSS Ex2] A: 0.23797 | B: 0.41488 | C: 0.32467 ** [JOINT LOSS] ** : 0.997056 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002701 | Grad Max: 0.068090 -> Layer: shared_layers.0.bias | Grad Mean: 0.072796 | Grad Max: 0.312151 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001919 | Grad Max: 0.007590 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000737 | Grad Max: 0.000737 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000564 | Grad Max: 0.164828 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009186 | Grad Max: 0.932008 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003221 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001223 | Grad Max: 0.013449 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000171 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000186 | Grad Max: 0.001469 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000060 | Grad Max: 0.000339 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001066 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000135 | Grad Max: 0.000135 [GRADIENT NORM TOTAL] 2.0217 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.364 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.58222324 0.4177768 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 402/1646 | B: 355/1693 | C: 269/1779 [LOSS Ex1] A: 0.67129 | B: 0.67039 | C: 0.66725 [LOGITS Ex2 A] Mean Abs: 1.499 | Max: 5.233 [LOSS Ex2] A: 0.21575 | B: 0.41152 | C: 0.34408 ** [JOINT LOSS] ** : 0.993426 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003874 | Grad Max: 0.100644 -> Layer: shared_layers.0.bias | Grad Mean: 0.327660 | Grad Max: 1.403558 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.008781 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010068 | Grad Max: 0.010068 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.304582 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039318 | Grad Max: 1.726431 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000428 | Grad Max: 0.011324 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017998 | Grad Max: 0.069744 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000772 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003910 | Grad Max: 0.008931 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001200 | Grad Max: 0.003089 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001959 | Grad Max: 0.003684 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032609 | Grad Max: 0.032609 [GRADIENT NORM TOTAL] 6.6143 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.408 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50154495 0.49845502] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 369/1679 | B: 318/1538 | C: 280/1768 [LOSS Ex1] A: 0.67438 | B: 0.67308 | C: 0.66663 [LOGITS Ex2 A] Mean Abs: 1.504 | Max: 5.237 [LOSS Ex2] A: 0.21793 | B: 0.40506 | C: 0.32470 ** [JOINT LOSS] ** : 0.987258 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight 
| Grad Mean: 0.004840 | Grad Max: 0.144386 -> Layer: shared_layers.0.bias | Grad Mean: 0.268718 | Grad Max: 1.151254 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001915 | Grad Max: 0.007598 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005852 | Grad Max: 0.005852 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001767 | Grad Max: 0.236241 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032420 | Grad Max: 1.343292 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.008804 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014704 | Grad Max: 0.053384 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000672 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003263 | Grad Max: 0.007210 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002548 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001685 | Grad Max: 0.003520 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027347 | Grad Max: 0.027347 [GRADIENT NORM TOTAL] 5.1423 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.205 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5750934 0.42490664] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 374/1674 | B: 328/1720 | C: 256/1792 [LOSS Ex1] A: 0.67260 | B: 0.67246 | C: 0.66919 [LOGITS Ex2 A] Mean Abs: 1.527 | Max: 5.620 [LOSS Ex2] A: 0.24223 | B: 0.41963 | C: 0.34816 ** [JOINT LOSS] ** : 1.008090 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001618 | Grad Max: 0.035603 -> Layer: shared_layers.0.bias | Grad Mean: 0.056163 | Grad Max: 0.359755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001966 | Grad Max: 0.008038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005740 | Grad Max: 0.005740 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000515 | Grad Max: 0.112222 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.009064 | Grad Max: 0.618828 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.003067 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003032 | Grad Max: 0.016047 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000279 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000619 | Grad Max: 0.002427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000190 | Grad Max: 0.000728 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001359 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005215 | Grad Max: 0.005215 [GRADIENT NORM TOTAL] 1.5684 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.272 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5462899 0.45371008] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 316/1300 | B: 344/1704 | C: 272/1776 [LOSS Ex1] A: 0.67156 | B: 0.67294 | C: 0.66686 [LOGITS Ex2 A] Mean Abs: 1.559 | Max: 5.165 [LOSS Ex2] A: 0.21532 | B: 0.41460 | C: 0.33156 ** [JOINT LOSS] ** : 0.990946 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002608 | Grad Max: 0.057163 -> Layer: shared_layers.0.bias | Grad Mean: 0.146260 | Grad Max: 0.733907 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.007853 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001548 | Grad Max: 0.001548 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000995 | Grad Max: 0.091345 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017893 | Grad Max: 0.500112 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006001 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007613 | Grad Max: 0.035374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000387 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001634 | Grad Max: 0.004400 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | 
Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000504 | Grad Max: 0.001369 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000687 | Grad Max: 0.002466 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013273 | Grad Max: 0.013273 [GRADIENT NORM TOTAL] 2.9090 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.410 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50690055 0.49309945] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 405/1643 | B: 357/1691 | C: 250/1798 [LOSS Ex1] A: 0.67179 | B: 0.67020 | C: 0.66951 [LOGITS Ex2 A] Mean Abs: 1.540 | Max: 6.984 [LOSS Ex2] A: 0.22023 | B: 0.40272 | C: 0.33757 ** [JOINT LOSS] ** : 0.990673 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.079296 -> Layer: shared_layers.0.bias | Grad Mean: 0.031018 | Grad Max: 0.132181 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.008872 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012678 | Grad Max: 0.012678 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000353 | Grad Max: 0.043142 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005686 | Grad Max: 0.240059 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003210 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001829 | Grad Max: 0.012064 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000309 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000403 | Grad Max: 0.002004 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000119 | Grad Max: 0.000650 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001399 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003038 | Grad Max: 0.003038 [GRADIENT NORM TOTAL] 0.8616 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.381 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504224 0.49577603] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 410/1638 | B: 320/1536 | C: 269/1779 [LOSS Ex1] A: 0.67073 | B: 0.67290 | C: 0.66718 [LOGITS Ex2 A] Mean Abs: 1.524 | Max: 5.154 [LOSS Ex2] A: 0.21077 | B: 0.38703 | C: 0.33202 ** [JOINT LOSS] ** : 0.980212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.040353 -> Layer: shared_layers.0.bias | Grad Mean: 0.022715 | Grad Max: 0.145399 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008988 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012033 | Grad Max: 0.012033 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000316 | Grad Max: 0.044878 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004894 | Grad Max: 0.239344 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003072 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001149 | Grad Max: 0.010954 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000191 | Grad Max: 0.001283 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000067 | Grad Max: 0.000457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000561 | Grad Max: 0.001418 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001355 | Grad Max: 0.001355 [GRADIENT NORM TOTAL] 0.7812 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.391 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50284535 0.49715468] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 386/1662 | B: 331/1717 | C: 240/1808 [LOSS Ex1] A: 0.66979 | B: 0.67226 | C: 0.67087 [LOGITS Ex2 A] Mean Abs: 1.544 | Max: 5.890 [LOSS Ex2] A: 0.24598 | B: 0.42041 | C: 
0.32464 ** [JOINT LOSS] ** : 1.001320 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003274 | Grad Max: 0.124731 -> Layer: shared_layers.0.bias | Grad Mean: 0.056294 | Grad Max: 0.212471 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.009214 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013324 | Grad Max: 0.013324 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000622 | Grad Max: 0.085610 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010090 | Grad Max: 0.472167 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.003576 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003355 | Grad Max: 0.020304 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000270 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000813 | Grad Max: 0.002604 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000130 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000260 | Grad Max: 0.000926 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000416 | Grad Max: 0.001494 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007364 | Grad Max: 0.007364 [GRADIENT NORM TOTAL] 1.5133 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.331 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50582266 0.49417734] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 373/1675 | B: 346/1702 | C: 186/1190 [LOSS Ex1] A: 0.67373 | B: 0.67273 | C: 0.66643 [LOGITS Ex2 A] Mean Abs: 1.505 | Max: 5.847 [LOSS Ex2] A: 0.22284 | B: 0.42331 | C: 0.34374 ** [JOINT LOSS] ** : 1.000926 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003407 | Grad Max: 0.092639 -> Layer: shared_layers.0.bias | Grad Mean: 0.190223 | Grad Max: 0.864598 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.007099 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000827 | Grad Max: 0.000827 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.001324 | Grad Max: 0.162383 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025033 | Grad Max: 0.924810 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.009206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012005 | Grad Max: 0.050690 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000661 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002605 | Grad Max: 0.006412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000264 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad Max: 0.002173 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001442 | Grad Max: 0.002849 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023006 | Grad Max: 0.023006 [GRADIENT NORM TOTAL] 3.6809 [EPOCH SUMMARY] Train Loss: 0.9974 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9696 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9734 -> New: 0.9696) ############################## EPOCH 54/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.056 | Max: 0.228 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52036434 0.47963566] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.025 [MASKS] A(Pass/Fail): 379/1669 | B: 357/1691 | C: 265/1783 [LOSS Ex1] A: 0.67323 | B: 0.66997 | C: 0.66715 [LOGITS Ex2 A] Mean Abs: 1.479 | Max: 5.363 [LOSS Ex2] A: 0.23278 | B: 0.40564 | C: 0.34775 ** [JOINT LOSS] ** : 0.998837 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003609 | Grad Max: 0.123441 -> Layer: shared_layers.0.bias | Grad Mean: 0.050923 | Grad Max: 0.250954 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.008137 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001746 | Grad Max: 0.001746 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000535 | Grad Max: 0.059790 -> Layer: exit2_layers.0.bias 
| Grad Mean: 0.008156 | Grad Max: 0.332946 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003297 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001622 | Grad Max: 0.014949 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000244 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000317 | Grad Max: 0.001747 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000101 | Grad Max: 0.000502 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000331 | Grad Max: 0.001179 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002952 | Grad Max: 0.002952 [GRADIENT NORM TOTAL] 1.2583 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.369 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58420956 0.41579038] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.027 [MASKS] A(Pass/Fail): 417/1631 | B: 323/1533 | C: 268/1780 [LOSS Ex1] A: 0.67090 | B: 0.67267 | C: 0.66767 [LOGITS Ex2 A] Mean Abs: 1.536 | Max: 6.534 [LOSS Ex2] A: 0.21932 | B: 0.39828 | C: 0.33608 ** [JOINT LOSS] ** : 0.988308 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004169 | Grad Max: 0.091656 -> Layer: shared_layers.0.bias | Grad Mean: 0.211753 | Grad Max: 0.934285 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.008742 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009575 | Grad Max: 0.009575 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001399 | Grad Max: 0.160608 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026198 | Grad Max: 0.855773 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.008389 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011517 | Grad Max: 0.048313 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000571 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002506 | Grad Max: 0.006086 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000018 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000776 | Grad Max: 0.001994 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.003350 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022630 | Grad Max: 0.022630 [GRADIENT NORM TOTAL] 4.1326 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.413 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013467 0.49865335] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 401/1647 | B: 331/1717 | C: 244/1804 [LOSS Ex1] A: 0.67404 | B: 0.67203 | C: 0.66870 [LOGITS Ex2 A] Mean Abs: 1.547 | Max: 5.341 [LOSS Ex2] A: 0.21557 | B: 0.42174 | C: 0.32804 ** [JOINT LOSS] ** : 0.993372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003079 | Grad Max: 0.076270 -> Layer: shared_layers.0.bias | Grad Mean: 0.148041 | Grad Max: 0.608153 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001864 | Grad Max: 0.007243 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002354 | Grad Max: 0.002354 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000921 | Grad Max: 0.117147 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017357 | Grad Max: 0.624951 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.004779 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007520 | Grad Max: 0.028570 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000470 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001611 | Grad Max: 0.004208 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000171 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000489 | Grad Max: 0.001423 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000755 | Grad Max: 0.002199 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013482 | Grad Max: 0.013482 [GRADIENT NORM TOTAL] 2.7231 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.210 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57677394 0.42322603] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.026 [MASKS] A(Pass/Fail): 426/1622 | B: 347/1701 | C: 287/1761 [LOSS Ex1] A: 0.67222 | B: 0.67251 | C: 0.66531 [LOGITS Ex2 A] Mean Abs: 1.531 | Max: 5.671 [LOSS Ex2] A: 0.23988 | B: 0.41736 | C: 0.31774 ** [JOINT LOSS] ** : 0.995007 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002621 | Grad Max: 0.087867 -> Layer: shared_layers.0.bias | Grad Mean: 0.208167 | Grad Max: 1.075573 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.008169 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005003 | Grad Max: 0.005003 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001447 | Grad Max: 0.239179 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026887 | Grad Max: 1.338233 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.008550 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012558 | Grad Max: 0.055307 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000529 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002707 | Grad Max: 0.006377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000823 | Grad Max: 0.002027 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001323 | Grad Max: 0.002517 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022548 | Grad Max: 0.022548 [GRADIENT NORM TOTAL] 4.4152 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.278 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54743934 0.45256066] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 352/1264 | B: 359/1689 | C: 254/1794 [LOSS Ex1] A: 0.67118 | B: 0.66974 | C: 0.66737 [LOGITS Ex2 A] Mean Abs: 1.552 | Max: 5.568 [LOSS Ex2] A: 0.21672 | B: 0.40943 | 
C: 0.34192 ** [JOINT LOSS] ** : 0.992123 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003322 | Grad Max: 0.109615 -> Layer: shared_layers.0.bias | Grad Mean: 0.284310 | Grad Max: 1.347645 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.008006 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002747 | Grad Max: 0.002747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.262886 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035858 | Grad Max: 1.474252 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000403 | Grad Max: 0.010913 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016833 | Grad Max: 0.068274 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000731 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003632 | Grad Max: 0.008028 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000316 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.002847 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001914 | Grad Max: 0.003624 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031472 | Grad Max: 0.031472 [GRADIENT NORM TOTAL] 5.8695 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.415 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067163 0.4932837] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 461/1587 | B: 324/1532 | C: 280/1768 [LOSS Ex1] A: 0.67139 | B: 0.67247 | C: 0.66468 [LOGITS Ex2 A] Mean Abs: 1.570 | Max: 6.589 [LOSS Ex2] A: 0.22507 | B: 0.38593 | C: 0.32489 ** [JOINT LOSS] ** : 0.981473 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003968 | Grad Max: 0.147866 -> Layer: shared_layers.0.bias | Grad Mean: 0.061995 | Grad Max: 0.299099 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.008078 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003549 | Grad Max: 0.003549 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.000582 | Grad Max: 0.156323 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008884 | Grad Max: 0.888538 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.003390 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001417 | Grad Max: 0.016398 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000235 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000249 | Grad Max: 0.001242 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000428 | Grad Max: 0.001364 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002745 | Grad Max: 0.002745 [GRADIENT NORM TOTAL] 1.7231 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.385 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044357 0.49556428] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 467/1581 | B: 338/1710 | C: 272/1776 [LOSS Ex1] A: 0.67030 | B: 0.67182 | C: 0.66856 [LOGITS Ex2 A] Mean Abs: 1.585 | Max: 5.457 [LOSS Ex2] A: 0.22455 | B: 0.42065 | C: 0.37523 ** [JOINT LOSS] ** : 1.010373 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004899 | Grad Max: 0.116117 -> Layer: shared_layers.0.bias | Grad Mean: 0.244646 | Grad Max: 1.117230 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.008658 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008555 | Grad Max: 0.008555 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001660 | Grad Max: 0.140354 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031576 | Grad Max: 0.719286 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.010582 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014748 | Grad Max: 0.056234 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000613 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.003198 | Grad Max: 0.007219 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000955 | Grad Max: 0.002438 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001533 | Grad Max: 0.002936 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025043 | Grad Max: 0.025043 [GRADIENT NORM TOTAL] 4.6263 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.396 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026327 0.49736732] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 462/1586 | B: 348/1700 | C: 264/1784 [LOSS Ex1] A: 0.66937 | B: 0.67231 | C: 0.66894 [LOGITS Ex2 A] Mean Abs: 1.572 | Max: 6.376 [LOSS Ex2] A: 0.23283 | B: 0.41003 | C: 0.33421 ** [JOINT LOSS] ** : 0.995896 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002196 | Grad Max: 0.054845 -> Layer: shared_layers.0.bias | Grad Mean: 0.106950 | Grad Max: 0.489642 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.008504 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006525 | Grad Max: 0.006525 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000826 | Grad Max: 0.081543 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014932 | Grad Max: 0.461901 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.006287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006173 | Grad Max: 0.031304 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000294 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001341 | Grad Max: 0.003352 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000408 | Grad Max: 0.001104 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.001619 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010428 | Grad Max: 0.010428 [GRADIENT NORM TOTAL] 
2.2486 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056467 0.49435326] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 438/1610 | B: 363/1685 | C: 258/1790 [LOSS Ex1] A: 0.67339 | B: 0.66954 | C: 0.66785 [LOGITS Ex2 A] Mean Abs: 1.509 | Max: 5.487 [LOSS Ex2] A: 0.22348 | B: 0.39584 | C: 0.33528 ** [JOINT LOSS] ** : 0.988463 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007327 | Grad Max: 0.194904 -> Layer: shared_layers.0.bias | Grad Mean: 0.299635 | Grad Max: 1.234077 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001939 | Grad Max: 0.007038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001990 | Grad Max: 0.001990 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.288539 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039674 | Grad Max: 1.517904 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.009613 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018101 | Grad Max: 0.064140 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000827 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003980 | Grad Max: 0.009072 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000316 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001216 | Grad Max: 0.002831 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002213 | Grad Max: 0.004123 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034282 | Grad Max: 0.034282 [GRADIENT NORM TOTAL] 5.7060 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.057 | Max: 0.230 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5207709 0.47922912] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 452/1596 | B: 327/1529 | C: 258/1790 [LOSS Ex1] A: 0.67289 
| B: 0.67228 | C: 0.66995 [LOGITS Ex2 A] Mean Abs: 1.482 | Max: 6.280 [LOSS Ex2] A: 0.24207 | B: 0.39248 | C: 0.33389 ** [JOINT LOSS] ** : 0.994521 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008260 | Grad Max: 0.237691 -> Layer: shared_layers.0.bias | Grad Mean: 0.330052 | Grad Max: 1.347302 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.008361 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012424 | Grad Max: 0.012424 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.298153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042445 | Grad Max: 1.560057 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.010486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019220 | Grad Max: 0.062898 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004253 | Grad Max: 0.009445 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000361 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001302 | Grad Max: 0.003124 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002278 | Grad Max: 0.004237 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036398 | Grad Max: 0.036398 [GRADIENT NORM TOTAL] 6.1678 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.374 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5861117 0.41388825] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.027 [MASKS] A(Pass/Fail): 502/1546 | B: 343/1705 | C: 264/1784 [LOSS Ex1] A: 0.67051 | B: 0.67164 | C: 0.66672 [LOGITS Ex2 A] Mean Abs: 1.560 | Max: 7.000 [LOSS Ex2] A: 0.21823 | B: 0.42421 | C: 0.33566 ** [JOINT LOSS] ** : 0.995659 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004011 | Grad Max: 0.138884 -> Layer: shared_layers.0.bias | Grad Mean: 0.070322 | Grad Max: 0.378126 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 
0.008424 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007607 | Grad Max: 0.007607 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000649 | Grad Max: 0.150712 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010752 | Grad Max: 0.819787 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.004068 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003978 | Grad Max: 0.021398 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000949 | Grad Max: 0.003166 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000291 | Grad Max: 0.000996 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000549 | Grad Max: 0.001687 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007701 | Grad Max: 0.007701 [GRADIENT NORM TOTAL] 1.7495 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.418 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012262 0.4987738] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 457/1591 | B: 350/1698 | C: 256/1792 [LOSS Ex1] A: 0.67372 | B: 0.67213 | C: 0.66824 [LOGITS Ex2 A] Mean Abs: 1.603 | Max: 5.391 [LOSS Ex2] A: 0.22418 | B: 0.43522 | C: 0.32433 ** [JOINT LOSS] ** : 0.999274 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004983 | Grad Max: 0.167502 -> Layer: shared_layers.0.bias | Grad Mean: 0.486258 | Grad Max: 2.232906 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.007614 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008170 | Grad Max: 0.008170 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003102 | Grad Max: 0.282982 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058480 | Grad Max: 1.593680 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000657 | Grad Max: 0.017556 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027811 | Grad Max: 0.109822 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001088 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005884 | Grad Max: 0.012938 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000460 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001772 | Grad Max: 0.004230 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002849 | Grad Max: 0.004987 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048043 | Grad Max: 0.048043 [GRADIENT NORM TOTAL] 9.4578 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.213 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57814455 0.42185542] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 465/1583 | B: 366/1682 | C: 267/1781 [LOSS Ex1] A: 0.67188 | B: 0.66935 | C: 0.66613 [LOGITS Ex2 A] Mean Abs: 1.611 | Max: 5.577 [LOSS Ex2] A: 0.24083 | B: 0.43190 | C: 0.33835 ** [JOINT LOSS] ** : 1.006149 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009386 | Grad Max: 0.213203 -> Layer: shared_layers.0.bias | Grad Mean: 0.625462 | Grad Max: 2.809701 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.007992 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003301 | Grad Max: 0.003301 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004160 | Grad Max: 0.361935 -> Layer: exit2_layers.0.bias | Grad Mean: 0.078758 | Grad Max: 1.954845 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000871 | Grad Max: 0.025466 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036788 | Grad Max: 0.157678 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000124 | Grad Max: 0.001567 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007974 | Grad Max: 0.017612 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000578 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002417 | Grad Max: 0.005635 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004059 | Grad Max: 0.006842 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.067203 | Grad Max: 0.067203 [GRADIENT NORM TOTAL] 12.1140 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.283 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54839236 0.4516076 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.027 [MASKS] A(Pass/Fail): 378/1238 | B: 329/1527 | C: 162/1214 [LOSS Ex1] A: 0.67084 | B: 0.67211 | C: 0.66997 [LOGITS Ex2 A] Mean Abs: 1.633 | Max: 5.443 [LOSS Ex2] A: 0.22069 | B: 0.40273 | C: 0.36722 ** [JOINT LOSS] ** : 1.001192 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007105 | Grad Max: 0.147659 -> Layer: shared_layers.0.bias | Grad Mean: 0.403199 | Grad Max: 1.796261 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001959 | Grad Max: 0.008594 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006393 | Grad Max: 0.006393 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002760 | Grad Max: 0.242311 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052360 | Grad Max: 1.336626 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.014476 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024406 | Grad Max: 0.093882 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.001071 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005273 | Grad Max: 0.012435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000420 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001591 | Grad Max: 0.003893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002696 | Grad Max: 0.004817 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043769 | Grad Max: 0.043769 [GRADIENT NORM TOTAL] 7.8455 [EPOCH SUMMARY] Train Loss: 0.9958 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9662 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9696 -> New: 0.9662) ############################## EPOCH 55/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.420 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50663364 0.49336636] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 498/1550 | B: 344/1704 | C: 268/1780 [LOSS Ex1] A: 0.67106 | B: 0.67147 | C: 0.66787 [LOGITS Ex2 A] Mean Abs: 1.571 | Max: 6.800 [LOSS Ex2] A: 0.22213 | B: 0.42043 | C: 0.34683 ** [JOINT LOSS] ** : 0.999931 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002518 | Grad Max: 0.059306 -> Layer: shared_layers.0.bias | Grad Mean: 0.112263 | Grad Max: 0.562740 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.008163 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007483 | Grad Max: 0.007483 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000775 | Grad Max: 0.084069 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013476 | Grad Max: 0.467730 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.005138 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004817 | Grad Max: 0.024322 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000272 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001011 | Grad Max: 0.003090 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000304 | Grad Max: 0.000925 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000536 | Grad Max: 0.001406 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008298 | Grad Max: 0.008298 [GRADIENT NORM TOTAL] 2.2202 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.389 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045645 0.49543545] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 486/1562 | B: 351/1697 | C: 237/1811 [LOSS Ex1] A: 0.66996 | B: 0.67198 | C: 0.67001 [LOGITS Ex2 A] Mean Abs: 1.538 | Max: 6.185 [LOSS Ex2] A: 0.21066 | B: 0.42462 | C: 0.35055 ** [JOINT LOSS] ** : 0.999262 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003029 | Grad Max: 0.108599 -> Layer: shared_layers.0.bias | Grad Mean: 0.285188 | Grad Max: 1.373014 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.009042 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014478 | Grad Max: 0.014478 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.193938 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036302 | Grad Max: 1.099221 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.011683 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017166 | Grad Max: 0.072385 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000698 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003675 | Grad Max: 0.008226 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001120 | Grad Max: 0.002991 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001924 | Grad Max: 0.004048 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031861 | Grad Max: 0.031861 [GRADIENT NORM TOTAL] 5.8286 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.400 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025035 0.49749646] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 479/1569 | B: 367/1681 | C: 261/1787 [LOSS Ex1] A: 0.66904 | B: 0.66920 | C: 0.66656 [LOGITS Ex2 A] Mean Abs: 1.539 | Max: 7.277 [LOSS Ex2] A: 0.23645 | B: 0.40485 | C: 0.33676 ** [JOINT LOSS] ** : 0.994290 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003203 | Grad Max: 0.095218 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.168527 | Grad Max: 0.843352 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.008568 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005741 | Grad Max: 0.005741 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001235 | Grad Max: 0.141236 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021645 | Grad Max: 0.794843 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000221 | Grad Max: 0.007680 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009162 | Grad Max: 0.047818 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000467 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001899 | Grad Max: 0.004842 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000572 | Grad Max: 0.001514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000933 | Grad Max: 0.001934 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015608 | Grad Max: 0.015608 [GRADIENT NORM TOTAL] 3.4140 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.338 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50551486 0.49448508] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.523 | Std: 0.027 [MASKS] A(Pass/Fail): 449/1599 | B: 332/1524 | C: 267/1781 [LOSS Ex1] A: 0.67314 | B: 0.67197 | C: 0.66740 [LOGITS Ex2 A] Mean Abs: 1.564 | Max: 5.390 [LOSS Ex2] A: 0.21381 | B: 0.39428 | C: 0.31886 ** [JOINT LOSS] ** : 0.979821 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004613 | Grad Max: 0.122113 -> Layer: shared_layers.0.bias | Grad Mean: 0.184761 | Grad Max: 0.772406 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.007273 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003147 | Grad Max: 0.003147 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001287 | Grad Max: 0.134309 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023895 | Grad Max: 0.686279 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000244 | Grad Max: 0.008418 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010264 | Grad Max: 0.042376 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000458 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002283 | Grad Max: 0.005245 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000253 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001925 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001209 | Grad Max: 0.003126 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020535 | Grad Max: 0.020535 [GRADIENT NORM TOTAL] 3.5221 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.232 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5210937 0.47890627] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.523 | Std: 0.026 [MASKS] A(Pass/Fail): 463/1585 | B: 343/1705 | C: 265/1783 [LOSS Ex1] A: 0.67265 | B: 0.67132 | C: 0.66734 [LOGITS Ex2 A] Mean Abs: 1.521 | Max: 5.919 [LOSS Ex2] A: 0.22782 | B: 0.42709 | C: 0.31628 ** [JOINT LOSS] ** : 0.994169 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004782 | Grad Max: 0.121377 -> Layer: shared_layers.0.bias | Grad Mean: 0.260292 | Grad Max: 1.156800 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.007540 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001344 | Grad Max: 0.001344 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001734 | Grad Max: 0.145440 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032446 | Grad Max: 0.771354 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.009389 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014400 | Grad Max: 0.053599 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000675 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003149 | Grad Max: 0.007292 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000278 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000959 | Grad 
Max: 0.002423 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001513 | Grad Max: 0.003531 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026486 | Grad Max: 0.026486 [GRADIENT NORM TOTAL] 4.9462 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.378 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58758956 0.41241047] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 515/1533 | B: 353/1695 | C: 269/1779 [LOSS Ex1] A: 0.67023 | B: 0.67183 | C: 0.66664 [LOGITS Ex2 A] Mean Abs: 1.546 | Max: 5.848 [LOSS Ex2] A: 0.21451 | B: 0.41618 | C: 0.32045 ** [JOINT LOSS] ** : 0.986614 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001368 | Grad Max: 0.031483 -> Layer: shared_layers.0.bias | Grad Mean: 0.031191 | Grad Max: 0.188174 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.008676 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012512 | Grad Max: 0.012512 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000324 | Grad Max: 0.055501 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005297 | Grad Max: 0.282298 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002350 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001188 | Grad Max: 0.009825 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000209 | Grad Max: 0.001739 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000068 | Grad Max: 0.000470 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.000991 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000654 | Grad Max: 0.000654 [GRADIENT NORM TOTAL] 0.9112 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.422 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.50107557 0.4989245 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 475/1573 | B: 367/1681 | C: 231/1817 [LOSS Ex1] A: 0.67348 | B: 0.66904 | C: 0.67021 [LOGITS Ex2 A] Mean Abs: 1.529 | Max: 5.244 [LOSS Ex2] A: 0.22482 | B: 0.40908 | C: 0.32146 ** [JOINT LOSS] ** : 0.989363 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005805 | Grad Max: 0.171927 -> Layer: shared_layers.0.bias | Grad Mean: 0.240105 | Grad Max: 1.058137 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001927 | Grad Max: 0.007388 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006077 | Grad Max: 0.006077 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001706 | Grad Max: 0.164417 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031428 | Grad Max: 0.904450 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.007577 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013804 | Grad Max: 0.049004 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000607 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003062 | Grad Max: 0.006961 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000925 | Grad Max: 0.002500 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001526 | Grad Max: 0.003124 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024888 | Grad Max: 0.024888 [GRADIENT NORM TOTAL] 4.6365 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57930404 0.42069596] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 483/1565 | B: 333/1523 | C: 288/1760 [LOSS Ex1] A: 0.67162 | B: 0.67181 | C: 0.66447 [LOGITS Ex2 A] Mean Abs: 1.535 | Max: 5.410 [LOSS Ex2] A: 0.23014 | B: 0.39575 | C: 0.33706 ** [JOINT LOSS] ** : 0.990286 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.004534 | Grad Max: 0.128877 -> Layer: shared_layers.0.bias | Grad Mean: 0.180708 | Grad Max: 0.714372 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.007882 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001098 | Grad Max: 0.001098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001330 | Grad Max: 0.142775 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024705 | Grad Max: 0.800091 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007226 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010386 | Grad Max: 0.044139 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002311 | Grad Max: 0.005423 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000197 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000701 | Grad Max: 0.001924 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001148 | Grad Max: 0.002391 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018771 | Grad Max: 0.018771 [GRADIENT NORM TOTAL] 3.6239 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.287 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54917824 0.45082176] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 389/1227 | B: 343/1705 | C: 295/1753 [LOSS Ex1] A: 0.67058 | B: 0.67115 | C: 0.66444 [LOGITS Ex2 A] Mean Abs: 1.611 | Max: 5.441 [LOSS Ex2] A: 0.20764 | B: 0.41479 | C: 0.36391 ** [JOINT LOSS] ** : 0.997510 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002915 | Grad Max: 0.075545 -> Layer: shared_layers.0.bias | Grad Mean: 0.220110 | Grad Max: 1.018529 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.007556 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006316 | Grad Max: 0.006316 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001411 | Grad Max: 0.125342 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.026231 | Grad Max: 0.703906 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.008869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011984 | Grad Max: 0.051104 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000517 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002553 | Grad Max: 0.005931 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000755 | Grad Max: 0.001896 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000983 | Grad Max: 0.002245 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018893 | Grad Max: 0.018893 [GRADIENT NORM TOTAL] 4.1665 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.423 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064943 0.4935057] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 506/1542 | B: 353/1695 | C: 254/1794 [LOSS Ex1] A: 0.67079 | B: 0.67167 | C: 0.66775 [LOGITS Ex2 A] Mean Abs: 1.584 | Max: 7.361 [LOSS Ex2] A: 0.22555 | B: 0.41618 | C: 0.32264 ** [JOINT LOSS] ** : 0.991527 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004600 | Grad Max: 0.126267 -> Layer: shared_layers.0.bias | Grad Mean: 0.317400 | Grad Max: 1.593096 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.008422 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010304 | Grad Max: 0.010304 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.185435 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039606 | Grad Max: 1.046603 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000445 | Grad Max: 0.011358 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018871 | Grad Max: 0.072868 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000769 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004098 | Grad Max: 0.009437 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000341 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001252 | Grad Max: 0.003075 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002068 | Grad Max: 0.004267 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034854 | Grad Max: 0.034854 [GRADIENT NORM TOTAL] 6.1534 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.392 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047814 0.49521858] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 497/1551 | B: 367/1681 | C: 268/1780 [LOSS Ex1] A: 0.66968 | B: 0.66887 | C: 0.66680 [LOGITS Ex2 A] Mean Abs: 1.574 | Max: 5.474 [LOSS Ex2] A: 0.21383 | B: 0.40143 | C: 0.35391 ** [JOINT LOSS] ** : 0.991508 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003101 | Grad Max: 0.109556 -> Layer: shared_layers.0.bias | Grad Mean: 0.087221 | Grad Max: 0.376075 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.009235 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012364 | Grad Max: 0.012364 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.124963 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013083 | Grad Max: 0.706277 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004285 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005152 | Grad Max: 0.025895 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000328 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001148 | Grad Max: 0.003583 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000339 | Grad Max: 0.000929 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000436 | Grad Max: 0.001580 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008592 | Grad Max: 0.008592 [GRADIENT NORM TOTAL] 2.0558 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.403 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023418 0.49765816] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 488/1560 | B: 333/1523 | C: 280/1768 [LOSS Ex1] A: 0.66876 | B: 0.67166 | C: 0.66543 [LOGITS Ex2 A] Mean Abs: 1.520 | Max: 6.475 [LOSS Ex2] A: 0.23086 | B: 0.41564 | C: 0.36179 ** [JOINT LOSS] ** : 1.004714 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006735 | Grad Max: 0.169788 -> Layer: shared_layers.0.bias | Grad Mean: 0.488209 | Grad Max: 2.153198 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.008744 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006301 | Grad Max: 0.006301 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003194 | Grad Max: 0.312685 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060452 | Grad Max: 1.728952 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000657 | Grad Max: 0.017315 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028073 | Grad Max: 0.113120 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001177 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006065 | Grad Max: 0.012933 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000469 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001847 | Grad Max: 0.004406 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003104 | Grad Max: 0.006392 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052114 | Grad Max: 0.052114 [GRADIENT NORM TOTAL] 9.5048 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.341 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5053482 0.4946518] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 457/1591 | B: 344/1704 | C: 277/1771 [LOSS Ex1] A: 0.67291 | B: 0.67099 | C: 0.66595 [LOGITS Ex2 A] Mean Abs: 1.487 | Max: 5.724 
[LOSS Ex2] A: 0.22237 | B: 0.45174 | C: 0.32769 ** [JOINT LOSS] ** : 1.003883 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009995 | Grad Max: 0.210811 -> Layer: shared_layers.0.bias | Grad Mean: 0.615201 | Grad Max: 2.798245 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001918 | Grad Max: 0.006816 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003427 | Grad Max: 0.003427 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004111 | Grad Max: 0.403556 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077703 | Grad Max: 2.242881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000855 | Grad Max: 0.021174 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036248 | Grad Max: 0.136252 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001467 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007896 | Grad Max: 0.017028 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000595 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002396 | Grad Max: 0.005790 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003881 | Grad Max: 0.007585 -> Layer: exit2_layers.12.bias | Grad Mean: 0.065588 | Grad Max: 0.065588 [GRADIENT NORM TOTAL] 11.8690 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.058 | Max: 0.234 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52144825 0.47855178] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 471/1577 | B: 354/1694 | C: 168/1208 [LOSS Ex1] A: 0.67243 | B: 0.67152 | C: 0.66800 [LOGITS Ex2 A] Mean Abs: 1.468 | Max: 5.226 [LOSS Ex2] A: 0.22732 | B: 0.42627 | C: 0.33586 ** [JOINT LOSS] ** : 1.000464 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006784 | Grad Max: 0.151198 -> Layer: shared_layers.0.bias | Grad Mean: 0.440531 | Grad Max: 1.939035 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.007706 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000336 | Grad 
Max: 0.000336 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002864 | Grad Max: 0.336818 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054035 | Grad Max: 1.823129 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000604 | Grad Max: 0.016252 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025634 | Grad Max: 0.103299 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.001017 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005576 | Grad Max: 0.011864 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000450 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001693 | Grad Max: 0.004029 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002798 | Grad Max: 0.004956 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046616 | Grad Max: 0.046616 [GRADIENT NORM TOTAL] 8.4295 [EPOCH SUMMARY] Train Loss: 0.9945 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9640 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9662 -> New: 0.9640) ############################## EPOCH 56/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.382 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58903766 0.41096234] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.028 [MASKS] A(Pass/Fail): 525/1523 | B: 368/1680 | C: 275/1773 [LOSS Ex1] A: 0.66996 | B: 0.66872 | C: 0.66713 [LOGITS Ex2 A] Mean Abs: 1.563 | Max: 5.646 [LOSS Ex2] A: 0.20662 | B: 0.39923 | C: 0.33695 ** [JOINT LOSS] ** : 0.982867 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003445 | Grad Max: 0.100401 -> Layer: shared_layers.0.bias | Grad Mean: 0.146184 | Grad Max: 0.632074 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.008186 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006683 | Grad Max: 0.006683 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001066 | Grad Max: 0.117909 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.019510 | Grad Max: 0.669744 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.005422 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008413 | Grad Max: 0.029823 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000460 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001866 | Grad Max: 0.004697 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000184 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001619 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000906 | Grad Max: 0.002648 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016099 | Grad Max: 0.016099 [GRADIENT NORM TOTAL] 3.0094 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.426 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008962 0.4991038] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 492/1556 | B: 335/1521 | C: 264/1784 [LOSS Ex1] A: 0.67325 | B: 0.67152 | C: 0.66794 [LOGITS Ex2 A] Mean Abs: 1.574 | Max: 5.434 [LOSS Ex2] A: 0.20754 | B: 0.39914 | C: 0.32438 ** [JOINT LOSS] ** : 0.981254 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003681 | Grad Max: 0.110131 -> Layer: shared_layers.0.bias | Grad Mean: 0.281395 | Grad Max: 1.288144 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.007934 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009977 | Grad Max: 0.009977 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001907 | Grad Max: 0.167311 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035942 | Grad Max: 0.935129 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.009780 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017253 | Grad Max: 0.062904 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000674 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003747 | Grad Max: 0.008052 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000318 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001147 | Grad Max: 0.003041 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001786 | Grad Max: 0.004269 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031968 | Grad Max: 0.031968 [GRADIENT NORM TOTAL] 5.5779 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.220 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58037364 0.41962636] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 492/1556 | B: 350/1698 | C: 257/1791 [LOSS Ex1] A: 0.67139 | B: 0.67085 | C: 0.66769 [LOGITS Ex2 A] Mean Abs: 1.574 | Max: 5.553 [LOSS Ex2] A: 0.22663 | B: 0.42065 | C: 0.33568 ** [JOINT LOSS] ** : 0.997629 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001737 | Grad Max: 0.038690 -> Layer: shared_layers.0.bias | Grad Mean: 0.089255 | Grad Max: 0.369024 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.007906 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004798 | Grad Max: 0.004798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000654 | Grad Max: 0.063520 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011413 | Grad Max: 0.348614 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000109 | Grad Max: 0.005321 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004431 | Grad Max: 0.029333 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000282 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000892 | Grad Max: 0.002975 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000261 | Grad Max: 0.000867 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000364 | Grad Max: 0.001329 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006770 | Grad Max: 0.006770 [GRADIENT NORM TOTAL] 1.8294 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 
32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.549896 0.450104] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 402/1214 | B: 354/1694 | C: 262/1786 [LOSS Ex1] A: 0.67034 | B: 0.67138 | C: 0.66471 [LOGITS Ex2 A] Mean Abs: 1.567 | Max: 6.065 [LOSS Ex2] A: 0.21162 | B: 0.41992 | C: 0.33547 ** [JOINT LOSS] ** : 0.991144 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006450 | Grad Max: 0.139263 -> Layer: shared_layers.0.bias | Grad Mean: 0.371871 | Grad Max: 1.794217 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007755 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001635 | Grad Max: 0.001635 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002521 | Grad Max: 0.187365 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047381 | Grad Max: 1.044283 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.013181 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022514 | Grad Max: 0.081205 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000948 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004929 | Grad Max: 0.010758 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000385 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001507 | Grad Max: 0.003463 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002596 | Grad Max: 0.004558 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042820 | Grad Max: 0.042820 [GRADIENT NORM TOTAL] 7.0890 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063848 0.4936152] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 523/1525 | B: 372/1676 | C: 269/1779 [LOSS Ex1] A: 0.67054 | B: 0.66857 | C: 0.66541 [LOGITS Ex2 A] Mean Abs: 1.549 | Max: 7.478 
[LOSS Ex2] A: 0.22302 | B: 0.42232 | C: 0.32901 ** [JOINT LOSS] ** : 0.992957 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007001 | Grad Max: 0.172973 -> Layer: shared_layers.0.bias | Grad Mean: 0.373512 | Grad Max: 1.652868 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.008132 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005964 | Grad Max: 0.005964 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002506 | Grad Max: 0.211143 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046842 | Grad Max: 1.159357 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000522 | Grad Max: 0.013269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021995 | Grad Max: 0.079514 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000940 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004837 | Grad Max: 0.010839 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000373 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001467 | Grad Max: 0.003513 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002313 | Grad Max: 0.004150 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039560 | Grad Max: 0.039560 [GRADIENT NORM TOTAL] 7.0920 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.396 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049813 0.4950187] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 520/1528 | B: 336/1520 | C: 277/1771 [LOSS Ex1] A: 0.66941 | B: 0.67138 | C: 0.66429 [LOGITS Ex2 A] Mean Abs: 1.570 | Max: 5.267 [LOSS Ex2] A: 0.21141 | B: 0.39192 | C: 0.32526 ** [JOINT LOSS] ** : 0.977893 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.049330 -> Layer: shared_layers.0.bias | Grad Mean: 0.126740 | Grad Max: 0.574279 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.008835 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007731 | Grad Max: 
0.007731 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000846 | Grad Max: 0.144941 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015059 | Grad Max: 0.812634 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.005020 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006364 | Grad Max: 0.030090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001362 | Grad Max: 0.003570 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000153 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001330 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000624 | Grad Max: 0.001832 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010111 | Grad Max: 0.010111 [GRADIENT NORM TOTAL] 2.5904 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.407 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50220096 0.497799 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 503/1545 | B: 351/1697 | C: 286/1762 [LOSS Ex1] A: 0.66850 | B: 0.67071 | C: 0.66554 [LOGITS Ex2 A] Mean Abs: 1.612 | Max: 6.825 [LOSS Ex2] A: 0.24349 | B: 0.42401 | C: 0.34641 ** [JOINT LOSS] ** : 1.006221 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009104 | Grad Max: 0.256061 -> Layer: shared_layers.0.bias | Grad Mean: 0.486404 | Grad Max: 2.165890 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.008990 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009262 | Grad Max: 0.009262 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003384 | Grad Max: 0.251605 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063290 | Grad Max: 1.376480 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000683 | Grad Max: 0.016234 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028959 | Grad Max: 0.105601 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001147 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.006389 | Grad Max: 0.013754 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000525 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001941 | Grad Max: 0.004881 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003074 | Grad Max: 0.005371 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052688 | Grad Max: 0.052688 [GRADIENT NORM TOTAL] 9.2630 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052187 0.4947813] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.028 [MASKS] A(Pass/Fail): 473/1575 | B: 354/1694 | C: 272/1776 [LOSS Ex1] A: 0.67270 | B: 0.67124 | C: 0.66659 [LOGITS Ex2 A] Mean Abs: 1.605 | Max: 5.521 [LOSS Ex2] A: 0.23059 | B: 0.42807 | C: 0.35144 ** [JOINT LOSS] ** : 1.006878 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009885 | Grad Max: 0.218709 -> Layer: shared_layers.0.bias | Grad Mean: 0.594088 | Grad Max: 2.611068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.006954 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001348 | Grad Max: 0.001348 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003969 | Grad Max: 0.353712 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074847 | Grad Max: 1.898408 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000818 | Grad Max: 0.019453 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034777 | Grad Max: 0.128960 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000115 | Grad Max: 0.001389 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007610 | Grad Max: 0.016977 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000572 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002312 | Grad Max: 0.005368 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003700 | Grad Max: 0.006536 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063213 | Grad Max: 0.063213 [GRADIENT NORM 
TOTAL] 11.3504 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.236 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5217529 0.4782471] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.027 [MASKS] A(Pass/Fail): 491/1557 | B: 373/1675 | C: 238/1810 [LOSS Ex1] A: 0.67222 | B: 0.66843 | C: 0.66812 [LOGITS Ex2 A] Mean Abs: 1.546 | Max: 6.228 [LOSS Ex2] A: 0.22304 | B: 0.40475 | C: 0.35399 ** [JOINT LOSS] ** : 0.996847 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006111 | Grad Max: 0.139189 -> Layer: shared_layers.0.bias | Grad Mean: 0.358587 | Grad Max: 1.560079 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.006961 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004946 | Grad Max: 0.004946 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002430 | Grad Max: 0.232498 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045559 | Grad Max: 1.248395 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000497 | Grad Max: 0.012415 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021149 | Grad Max: 0.080397 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000831 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004644 | Grad Max: 0.009689 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000343 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001417 | Grad Max: 0.003333 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002288 | Grad Max: 0.004202 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039222 | Grad Max: 0.039222 [GRADIENT NORM TOTAL] 6.9759 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.385 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59032685 0.40967312] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 549/1499 | B: 337/1519 | C: 251/1797 [LOSS Ex1] A: 
0.66970 | B: 0.67126 | C: 0.66676 [LOGITS Ex2 A] Mean Abs: 1.544 | Max: 5.841 [LOSS Ex2] A: 0.21653 | B: 0.39068 | C: 0.32569 ** [JOINT LOSS] ** : 0.980205 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001806 | Grad Max: 0.044658 -> Layer: shared_layers.0.bias | Grad Mean: 0.132940 | Grad Max: 0.643621 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.008767 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012075 | Grad Max: 0.012075 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000927 | Grad Max: 0.117750 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016787 | Grad Max: 0.667532 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.005898 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007091 | Grad Max: 0.032998 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000377 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001491 | Grad Max: 0.004128 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000174 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000437 | Grad Max: 0.001466 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000600 | Grad Max: 0.001915 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010636 | Grad Max: 0.010636 [GRADIENT NORM TOTAL] 2.7516 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.430 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007908 0.4992092] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 511/1537 | B: 351/1697 | C: 277/1771 [LOSS Ex1] A: 0.67304 | B: 0.67058 | C: 0.66631 [LOGITS Ex2 A] Mean Abs: 1.542 | Max: 5.847 [LOSS Ex2] A: 0.20437 | B: 0.42719 | C: 0.32712 ** [JOINT LOSS] ** : 0.989538 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004948 | Grad Max: 0.126977 -> Layer: shared_layers.0.bias | Grad Mean: 0.319665 | Grad Max: 1.433124 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001989 | Grad 
Max: 0.007893 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008149 | Grad Max: 0.008149 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.290361 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040743 | Grad Max: 1.643592 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.011482 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018331 | Grad Max: 0.071126 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000706 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004030 | Grad Max: 0.008923 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000326 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001222 | Grad Max: 0.003073 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001925 | Grad Max: 0.003492 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032879 | Grad Max: 0.032879 [GRADIENT NORM TOTAL] 6.5796 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.223 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5813542 0.4186458] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 501/1547 | B: 354/1694 | C: 251/1797 [LOSS Ex1] A: 0.67116 | B: 0.67112 | C: 0.66796 [LOGITS Ex2 A] Mean Abs: 1.546 | Max: 5.915 [LOSS Ex2] A: 0.22988 | B: 0.41475 | C: 0.31904 ** [JOINT LOSS] ** : 0.991303 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003777 | Grad Max: 0.079065 -> Layer: shared_layers.0.bias | Grad Mean: 0.160319 | Grad Max: 0.685577 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001970 | Grad Max: 0.008086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006813 | Grad Max: 0.006813 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001120 | Grad Max: 0.226405 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020989 | Grad Max: 1.282527 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000209 | Grad Max: 0.005355 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008918 | Grad Max: 0.034701 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000447 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001974 | Grad Max: 0.004767 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000593 | Grad Max: 0.001634 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000905 | Grad Max: 0.002235 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015620 | Grad Max: 0.015620 [GRADIENT NORM TOTAL] 3.6128 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.294 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5505402 0.4494598] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.028 [MASKS] A(Pass/Fail): 415/1201 | B: 372/1676 | C: 277/1771 [LOSS Ex1] A: 0.67012 | B: 0.66831 | C: 0.66574 [LOGITS Ex2 A] Mean Abs: 1.607 | Max: 5.699 [LOSS Ex2] A: 0.20906 | B: 0.40472 | C: 0.32351 ** [JOINT LOSS] ** : 0.980484 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002753 | Grad Max: 0.087881 -> Layer: shared_layers.0.bias | Grad Mean: 0.231160 | Grad Max: 1.122072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008161 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001171 | Grad Max: 0.001171 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001485 | Grad Max: 0.144915 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027642 | Grad Max: 0.819403 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.008847 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012414 | Grad Max: 0.053906 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000527 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002643 | Grad Max: 0.005904 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000210 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000806 | Grad Max: 0.002059 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001223 | Grad Max: 0.002838 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.022108 | Grad Max: 0.022108 [GRADIENT NORM TOTAL] 4.5801 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.431 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062857 0.49371427] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 533/1515 | B: 336/1520 | C: 172/1204 [LOSS Ex1] A: 0.67032 | B: 0.67114 | C: 0.66844 [LOGITS Ex2 A] Mean Abs: 1.592 | Max: 6.032 [LOSS Ex2] A: 0.21979 | B: 0.38447 | C: 0.33849 ** [JOINT LOSS] ** : 0.984215 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004106 | Grad Max: 0.099110 -> Layer: shared_layers.0.bias | Grad Mean: 0.279398 | Grad Max: 1.215695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 0.007943 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006125 | Grad Max: 0.006125 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.173503 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034697 | Grad Max: 0.914301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.011787 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016422 | Grad Max: 0.070407 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000730 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003541 | Grad Max: 0.008033 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000272 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001080 | Grad Max: 0.002610 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001684 | Grad Max: 0.003603 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029826 | Grad Max: 0.029826 [GRADIENT NORM TOTAL] 5.4584 [EPOCH SUMMARY] Train Loss: 0.9900 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9620 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9640 -> New: 0.9620) ############################## EPOCH 57/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.398 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50507283 0.4949272 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 528/1520 | B: 353/1695 | C: 244/1804 [LOSS Ex1] A: 0.66918 | B: 0.67046 | C: 0.66795 [LOGITS Ex2 A] Mean Abs: 1.558 | Max: 5.676 [LOSS Ex2] A: 0.20707 | B: 0.42018 | C: 0.32352 ** [JOINT LOSS] ** : 0.986121 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001510 | Grad Max: 0.055128 -> Layer: shared_layers.0.bias | Grad Mean: 0.023101 | Grad Max: 0.098332 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008505 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.008072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000299 | Grad Max: 0.032802 -> Layer: exit2_layers.0.bias | Grad Mean: 0.004708 | Grad Max: 0.175928 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003225 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001160 | Grad Max: 0.013012 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000208 | Grad Max: 0.001353 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000063 | Grad Max: 0.000407 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001158 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000553 | Grad Max: 0.000553 [GRADIENT NORM TOTAL] 0.6772 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.410 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020897 0.4979103] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 513/1535 | B: 355/1693 | C: 261/1787 [LOSS Ex1] A: 0.66826 | B: 0.67100 | C: 0.66729 [LOGITS Ex2 A] Mean Abs: 1.523 | Max: 8.216 [LOSS Ex2] A: 0.23214 | B: 0.41263 | C: 0.34119 ** [JOINT LOSS] ** : 0.997506 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003054 | Grad Max: 0.080178 -> Layer: shared_layers.0.bias | Grad Mean: 0.213725 | Grad Max: 0.999366 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.008758 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009759 | Grad Max: 0.009759 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001400 | Grad Max: 0.148110 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026001 | Grad Max: 0.812616 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.009006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012091 | Grad Max: 0.053587 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000534 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002615 | Grad Max: 0.006006 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000786 | Grad Max: 0.001929 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001265 | Grad Max: 0.002409 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021720 | Grad Max: 0.021720 [GRADIENT NORM TOTAL] 4.1027 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.346 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50510126 0.49489874] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.028 [MASKS] A(Pass/Fail): 483/1565 | B: 373/1675 | C: 282/1766 [LOSS Ex1] A: 0.67250 | B: 0.66817 | C: 0.66511 [LOGITS Ex2 A] Mean Abs: 1.517 | Max: 6.282 [LOSS Ex2] A: 0.21144 | B: 0.39717 | C: 0.32280 ** [JOINT LOSS] ** : 0.979061 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.044460 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.118342 | Grad Max: 0.537693 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001998 | Grad Max: 0.006754 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002041 | Grad Max: 0.002041 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000854 | Grad Max: 0.143090 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015346 | Grad Max: 0.783975 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.005519 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007247 | Grad Max: 0.031704 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000441 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001564 | Grad Max: 0.004329 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000161 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000464 | Grad Max: 0.001336 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000761 | Grad Max: 0.002519 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012161 | Grad Max: 0.012161 [GRADIENT NORM TOTAL] 2.4522 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.059 | Max: 0.237 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52197105 0.47802892] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.028 [MASKS] A(Pass/Fail): 494/1554 | B: 336/1520 | C: 288/1760 [LOSS Ex1] A: 0.67202 | B: 0.67099 | C: 0.66349 [LOGITS Ex2 A] Mean Abs: 1.537 | Max: 6.187 [LOSS Ex2] A: 0.23210 | B: 0.39229 | C: 0.31940 ** [JOINT LOSS] ** : 0.983434 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006207 | Grad Max: 0.176618 -> Layer: shared_layers.0.bias | Grad Mean: 0.301464 | Grad Max: 1.278928 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.008240 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005223 | Grad Max: 0.005223 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.193152 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039693 | Grad Max: 1.060018 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000425 | Grad Max: 0.010813 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018050 | Grad Max: 0.068042 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000741 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004021 | Grad Max: 0.008733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001241 | Grad Max: 0.002954 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002045 | Grad Max: 0.004403 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035462 | Grad Max: 0.035462 [GRADIENT NORM TOTAL] 5.8858 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.388 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5915564 0.4084436] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 555/1493 | B: 353/1695 | C: 292/1756 [LOSS Ex1] A: 0.66946 | B: 0.67031 | C: 0.66454 [LOGITS Ex2 A] Mean Abs: 1.581 | Max: 6.431 [LOSS Ex2] A: 0.21854 | B: 0.41860 | C: 0.31809 ** [JOINT LOSS] ** : 0.986514 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007273 | Grad Max: 0.225663 -> Layer: shared_layers.0.bias | Grad Mean: 0.370615 | Grad Max: 1.641627 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.008308 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006946 | Grad Max: 0.006946 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002582 | Grad Max: 0.192960 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047801 | Grad Max: 1.043747 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000516 | Grad Max: 0.013547 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021861 | Grad Max: 0.081119 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000927 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004866 | Grad Max: 0.010851 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000361 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001493 | Grad 
Max: 0.003707 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002417 | Grad Max: 0.004836 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041620 | Grad Max: 0.041620 [GRADIENT NORM TOTAL] 7.0986 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.433 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50067407 0.4993259 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 515/1533 | B: 356/1692 | C: 270/1778 [LOSS Ex1] A: 0.67283 | B: 0.67085 | C: 0.66478 [LOGITS Ex2 A] Mean Abs: 1.563 | Max: 5.534 [LOSS Ex2] A: 0.20259 | B: 0.40868 | C: 0.33209 ** [JOINT LOSS] ** : 0.983936 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001596 | Grad Max: 0.041583 -> Layer: shared_layers.0.bias | Grad Mean: 0.079974 | Grad Max: 0.424801 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.007319 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003190 | Grad Max: 0.003190 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000648 | Grad Max: 0.097449 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011618 | Grad Max: 0.530799 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004011 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004646 | Grad Max: 0.022299 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000311 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000982 | Grad Max: 0.003090 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000126 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000299 | Grad Max: 0.001113 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000506 | Grad Max: 0.002069 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007941 | Grad Max: 0.007941 [GRADIENT NORM TOTAL] 1.8511 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.225 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5823173 0.41768274] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 512/1536 | B: 374/1674 | C: 264/1784 [LOSS Ex1] A: 0.67093 | B: 0.66801 | C: 0.66585 [LOGITS Ex2 A] Mean Abs: 1.537 | Max: 5.647 [LOSS Ex2] A: 0.22342 | B: 0.41257 | C: 0.34938 ** [JOINT LOSS] ** : 0.996716 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006182 | Grad Max: 0.131971 -> Layer: shared_layers.0.bias | Grad Mean: 0.405528 | Grad Max: 1.802816 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.007100 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004461 | Grad Max: 0.004461 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002674 | Grad Max: 0.241907 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050536 | Grad Max: 1.367549 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000562 | Grad Max: 0.013880 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024016 | Grad Max: 0.089027 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000905 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005243 | Grad Max: 0.010781 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000395 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001602 | Grad Max: 0.003724 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002604 | Grad Max: 0.005096 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044114 | Grad Max: 0.044114 [GRADIENT NORM TOTAL] 7.7434 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.298 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5511262 0.44887382] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 423/1193 | B: 343/1513 | C: 265/1783 [LOSS Ex1] A: 0.66987 | B: 0.67085 | C: 0.66622 [LOGITS Ex2 A] Mean Abs: 1.541 | Max: 5.834 [LOSS Ex2] A: 0.22006 | B: 0.41950 | C: 0.34423 ** [JOINT LOSS] ** : 0.996913 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.008900 | Grad Max: 0.182869 -> Layer: shared_layers.0.bias | Grad Mean: 0.550196 | Grad Max: 2.393087 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.008227 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005327 | Grad Max: 0.005327 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003644 | Grad Max: 0.316444 -> Layer: exit2_layers.0.bias | Grad Mean: 0.068367 | Grad Max: 1.767975 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000746 | Grad Max: 0.019130 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031796 | Grad Max: 0.120224 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001236 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006943 | Grad Max: 0.015014 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000536 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002112 | Grad Max: 0.005057 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003424 | Grad Max: 0.006191 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057931 | Grad Max: 0.057931 [GRADIENT NORM TOTAL] 10.6161 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.434 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062187 0.49378133] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 541/1507 | B: 355/1693 | C: 251/1797 [LOSS Ex1] A: 0.67007 | B: 0.67016 | C: 0.66886 [LOGITS Ex2 A] Mean Abs: 1.554 | Max: 7.000 [LOSS Ex2] A: 0.21105 | B: 0.42887 | C: 0.34138 ** [JOINT LOSS] ** : 0.996796 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006332 | Grad Max: 0.142905 -> Layer: shared_layers.0.bias | Grad Mean: 0.419623 | Grad Max: 1.840796 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001994 | Grad Max: 0.008158 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007879 | Grad Max: 0.007879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002798 | Grad Max: 0.310549 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.052669 | Grad Max: 1.755409 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.015516 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024488 | Grad Max: 0.092197 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000910 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005357 | Grad Max: 0.011234 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000409 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001625 | Grad Max: 0.003953 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002594 | Grad Max: 0.004514 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044090 | Grad Max: 0.044090 [GRADIENT NORM TOTAL] 8.2847 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.402 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50512266 0.4948774 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 533/1515 | B: 358/1690 | C: 258/1790 [LOSS Ex1] A: 0.66892 | B: 0.67070 | C: 0.66720 [LOGITS Ex2 A] Mean Abs: 1.588 | Max: 5.663 [LOSS Ex2] A: 0.21516 | B: 0.40303 | C: 0.35304 ** [JOINT LOSS] ** : 0.992684 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002509 | Grad Max: 0.062150 -> Layer: shared_layers.0.bias | Grad Mean: 0.130988 | Grad Max: 0.548308 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.008672 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011959 | Grad Max: 0.011959 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000943 | Grad Max: 0.088968 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017143 | Grad Max: 0.486428 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.006005 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007408 | Grad Max: 0.036569 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000442 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001635 | Grad Max: 0.004901 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000167 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000492 | Grad Max: 0.001266 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000656 | Grad Max: 0.001898 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012672 | Grad Max: 0.012672 [GRADIENT NORM TOTAL] 2.6556 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.413 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50201386 0.49798617] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 522/1526 | B: 376/1672 | C: 245/1803 [LOSS Ex1] A: 0.66801 | B: 0.66787 | C: 0.66700 [LOGITS Ex2 A] Mean Abs: 1.586 | Max: 5.941 [LOSS Ex2] A: 0.23297 | B: 0.39941 | C: 0.33511 ** [JOINT LOSS] ** : 0.990121 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005935 | Grad Max: 0.188742 -> Layer: shared_layers.0.bias | Grad Mean: 0.287944 | Grad Max: 1.285261 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.008939 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007975 | Grad Max: 0.007975 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.170658 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037646 | Grad Max: 0.892579 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.010202 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017302 | Grad Max: 0.064845 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000782 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003845 | Grad Max: 0.008599 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000322 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001190 | Grad Max: 0.003010 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002008 | Grad Max: 0.004341 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033831 | Grad Max: 0.033831 [GRADIENT NORM TOTAL] 5.6126 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.349 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050191 0.4949809] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.524 | Std: 0.028 [MASKS] A(Pass/Fail): 494/1554 | B: 343/1513 | C: 285/1763 [LOSS Ex1] A: 0.67229 | B: 0.67071 | C: 0.66487 [LOGITS Ex2 A] Mean Abs: 1.552 | Max: 5.281 [LOSS Ex2] A: 0.20441 | B: 0.38768 | C: 0.31184 ** [JOINT LOSS] ** : 0.970599 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.033679 -> Layer: shared_layers.0.bias | Grad Mean: 0.047245 | Grad Max: 0.226833 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.006587 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004682 | Grad Max: 0.004682 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000377 | Grad Max: 0.138343 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006309 | Grad Max: 0.773839 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002979 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001545 | Grad Max: 0.016513 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000159 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000285 | Grad Max: 0.001573 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000604 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000616 | Grad Max: 0.001587 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002449 | Grad Max: 0.002449 [GRADIENT NORM TOTAL] 1.4744 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.239 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5221809 0.4778191] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.524 | Std: 0.028 [MASKS] A(Pass/Fail): 500/1548 | B: 357/1691 | C: 247/1801 [LOSS Ex1] A: 0.67181 | B: 0.67002 | C: 0.66748 [LOGITS Ex2 A] Mean Abs: 1.492 | Max: 6.098 
[LOSS Ex2] A: 0.21761 | B: 0.41682 | C: 0.34140 ** [JOINT LOSS] ** : 0.995042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003260 | Grad Max: 0.082620 -> Layer: shared_layers.0.bias | Grad Mean: 0.240549 | Grad Max: 1.091910 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.007447 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000359 | Grad Max: 0.000359 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001562 | Grad Max: 0.161894 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029632 | Grad Max: 0.908848 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.009806 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014358 | Grad Max: 0.060036 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000565 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003089 | Grad Max: 0.006661 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000932 | Grad Max: 0.002433 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001560 | Grad Max: 0.003268 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025519 | Grad Max: 0.025519 [GRADIENT NORM TOTAL] 4.6262 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.392 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59275395 0.40724608] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 561/1487 | B: 360/1688 | C: 180/1196 [LOSS Ex1] A: 0.66922 | B: 0.67055 | C: 0.66484 [LOGITS Ex2 A] Mean Abs: 1.567 | Max: 5.673 [LOSS Ex2] A: 0.21282 | B: 0.41856 | C: 0.31154 ** [JOINT LOSS] ** : 0.982515 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002538 | Grad Max: 0.053101 -> Layer: shared_layers.0.bias | Grad Mean: 0.099945 | Grad Max: 0.430139 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.009164 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014740 | Grad 
Max: 0.014740 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000674 | Grad Max: 0.096188 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012289 | Grad Max: 0.491487 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.004602 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005884 | Grad Max: 0.029676 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000363 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001273 | Grad Max: 0.003375 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000143 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000383 | Grad Max: 0.001088 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000735 | Grad Max: 0.002044 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011220 | Grad Max: 0.011220 [GRADIENT NORM TOTAL] 1.9474 [EPOCH SUMMARY] Train Loss: 0.9884 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9651 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 58/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.437 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50061876 0.49938124] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 522/1526 | B: 376/1672 | C: 268/1780 [LOSS Ex1] A: 0.67261 | B: 0.66770 | C: 0.66579 [LOGITS Ex2 A] Mean Abs: 1.592 | Max: 5.415 [LOSS Ex2] A: 0.21236 | B: 0.40148 | C: 0.33980 ** [JOINT LOSS] ** : 0.986582 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003416 | Grad Max: 0.119319 -> Layer: shared_layers.0.bias | Grad Mean: 0.324266 | Grad Max: 1.454116 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001992 | Grad Max: 0.007294 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004307 | Grad Max: 0.004307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.188603 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.039823 | Grad Max: 1.061224 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.013082 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019190 | Grad Max: 0.077629 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000868 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004147 | Grad Max: 0.009618 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001241 | Grad Max: 0.003035 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001838 | Grad Max: 0.003663 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033190 | Grad Max: 0.033190 [GRADIENT NORM TOTAL] 6.3859 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.228 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5832424 0.41675755] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 514/1534 | B: 345/1511 | C: 269/1779 [LOSS Ex1] A: 0.67069 | B: 0.67055 | C: 0.66525 [LOGITS Ex2 A] Mean Abs: 1.610 | Max: 6.016 [LOSS Ex2] A: 0.22422 | B: 0.39694 | C: 0.31809 ** [JOINT LOSS] ** : 0.981913 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004750 | Grad Max: 0.113877 -> Layer: shared_layers.0.bias | Grad Mean: 0.310633 | Grad Max: 1.329426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.007948 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002724 | Grad Max: 0.002724 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.205885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039947 | Grad Max: 1.155029 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000441 | Grad Max: 0.012274 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019038 | Grad Max: 0.076296 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000755 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004138 | Grad Max: 0.009094 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 
| Grad Max: 0.000287 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001250 | Grad Max: 0.002826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001978 | Grad Max: 0.004070 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034092 | Grad Max: 0.034092 [GRADIENT NORM TOTAL] 6.1646 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.302 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55172366 0.44827637] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 427/1189 | B: 359/1689 | C: 292/1756 [LOSS Ex1] A: 0.66964 | B: 0.66985 | C: 0.66395 [LOGITS Ex2 A] Mean Abs: 1.614 | Max: 6.439 [LOSS Ex2] A: 0.20043 | B: 0.41482 | C: 0.31626 ** [JOINT LOSS] ** : 0.978315 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001445 | Grad Max: 0.037761 -> Layer: shared_layers.0.bias | Grad Mean: 0.058765 | Grad Max: 0.380012 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.007899 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002546 | Grad Max: 0.002546 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000413 | Grad Max: 0.096327 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007008 | Grad Max: 0.549876 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.002705 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002245 | Grad Max: 0.015795 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000487 | Grad Max: 0.002366 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000686 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001582 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003903 | Grad Max: 0.003903 [GRADIENT NORM TOTAL] 1.2811 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.439 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50619185 0.49380815] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 551/1497 | B: 361/1687 | C: 257/1791 [LOSS Ex1] A: 0.66982 | B: 0.67038 | C: 0.66727 [LOGITS Ex2 A] Mean Abs: 1.590 | Max: 6.128 [LOSS Ex2] A: 0.20453 | B: 0.41788 | C: 0.33501 ** [JOINT LOSS] ** : 0.988293 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005946 | Grad Max: 0.141598 -> Layer: shared_layers.0.bias | Grad Mean: 0.362966 | Grad Max: 1.643561 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001948 | Grad Max: 0.007760 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004579 | Grad Max: 0.004579 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.206940 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046920 | Grad Max: 1.170284 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000520 | Grad Max: 0.013182 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022292 | Grad Max: 0.086156 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000873 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004889 | Grad Max: 0.010675 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000366 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001483 | Grad Max: 0.003508 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002465 | Grad Max: 0.004219 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041204 | Grad Max: 0.041204 [GRADIENT NORM TOTAL] 7.0253 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.406 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50516295 0.49483705] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 540/1508 | B: 379/1669 | C: 264/1784 [LOSS Ex1] A: 0.66864 | B: 0.66753 | C: 0.66703 [LOGITS Ex2 A] Mean Abs: 1.571 | Max: 5.462 [LOSS Ex2] A: 0.19847 | B: 0.40968 | C: 
0.33504 ** [JOINT LOSS] ** : 0.982131 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005004 | Grad Max: 0.122232 -> Layer: shared_layers.0.bias | Grad Mean: 0.351743 | Grad Max: 1.597424 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.009098 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014256 | Grad Max: 0.014256 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.204194 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043880 | Grad Max: 1.162877 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000482 | Grad Max: 0.013218 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020734 | Grad Max: 0.082758 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000886 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004534 | Grad Max: 0.009959 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000354 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001373 | Grad Max: 0.003410 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002224 | Grad Max: 0.004201 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037215 | Grad Max: 0.037215 [GRADIENT NORM TOTAL] 6.7174 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.417 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019931 0.4980069] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 528/1520 | B: 348/1508 | C: 278/1770 [LOSS Ex1] A: 0.66773 | B: 0.67039 | C: 0.66371 [LOGITS Ex2 A] Mean Abs: 1.575 | Max: 6.758 [LOSS Ex2] A: 0.22272 | B: 0.38611 | C: 0.32995 ** [JOINT LOSS] ** : 0.980203 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003300 | Grad Max: 0.112196 -> Layer: shared_layers.0.bias | Grad Mean: 0.087179 | Grad Max: 0.373124 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.009346 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013761 | Grad Max: 0.013761 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.000706 | Grad Max: 0.122353 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011370 | Grad Max: 0.689542 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000094 | Grad Max: 0.004435 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003444 | Grad Max: 0.024544 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000244 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000604 | Grad Max: 0.002771 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000163 | Grad Max: 0.000772 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000293 | Grad Max: 0.001122 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003828 | Grad Max: 0.003828 [GRADIENT NORM TOTAL] 1.8556 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.352 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050011 0.4949989] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 505/1543 | B: 359/1689 | C: 254/1794 [LOSS Ex1] A: 0.67204 | B: 0.66968 | C: 0.66678 [LOGITS Ex2 A] Mean Abs: 1.598 | Max: 5.595 [LOSS Ex2] A: 0.20967 | B: 0.42350 | C: 0.33856 ** [JOINT LOSS] ** : 0.993411 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004415 | Grad Max: 0.108618 -> Layer: shared_layers.0.bias | Grad Mean: 0.323027 | Grad Max: 1.420796 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001897 | Grad Max: 0.006998 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002453 | Grad Max: 0.002453 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.174121 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039217 | Grad Max: 0.982516 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.010742 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018382 | Grad Max: 0.075636 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000704 -> Layer: exit2_layers.6.bias | Grad Mean: 
0.003990 | Grad Max: 0.008808 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000327 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001202 | Grad Max: 0.002880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001909 | Grad Max: 0.003746 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032802 | Grad Max: 0.032802 [GRADIENT NORM TOTAL] 6.2056 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.060 | Max: 0.241 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5224113 0.47758865] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.028 [MASKS] A(Pass/Fail): 503/1545 | B: 363/1685 | C: 275/1773 [LOSS Ex1] A: 0.67156 | B: 0.67021 | C: 0.66455 [LOGITS Ex2 A] Mean Abs: 1.585 | Max: 6.081 [LOSS Ex2] A: 0.22705 | B: 0.42560 | C: 0.30644 ** [JOINT LOSS] ** : 0.988471 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005743 | Grad Max: 0.143655 -> Layer: shared_layers.0.bias | Grad Mean: 0.397647 | Grad Max: 1.976695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.006825 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006430 | Grad Max: 0.006430 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002606 | Grad Max: 0.257885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049436 | Grad Max: 1.412772 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.014849 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023211 | Grad Max: 0.094558 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000960 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005040 | Grad Max: 0.010857 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000369 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001515 | Grad Max: 0.003584 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002401 | Grad Max: 0.004539 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040949 | Grad Max: 0.040949 [GRADIENT NORM TOTAL] 7.8313 >>> [TRAIN] BATCH 8 START 
<<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.396 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59413296 0.4058671 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 571/1477 | B: 381/1667 | C: 262/1786 [LOSS Ex1] A: 0.66894 | B: 0.66736 | C: 0.66605 [LOGITS Ex2 A] Mean Abs: 1.609 | Max: 6.271 [LOSS Ex2] A: 0.20450 | B: 0.38862 | C: 0.33065 ** [JOINT LOSS] ** : 0.975372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002617 | Grad Max: 0.108151 -> Layer: shared_layers.0.bias | Grad Mean: 0.229687 | Grad Max: 1.209180 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.008398 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007608 | Grad Max: 0.007608 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.178276 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028897 | Grad Max: 1.004770 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.009319 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013497 | Grad Max: 0.058976 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000573 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002896 | Grad Max: 0.006341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000877 | Grad Max: 0.002174 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001393 | Grad Max: 0.003369 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024407 | Grad Max: 0.024407 [GRADIENT NORM TOTAL] 4.7253 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.442 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50058275 0.49941725] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 529/1519 | B: 350/1506 | C: 261/1787 [LOSS Ex1] A: 0.67236 | B: 0.67022 | C: 0.66770 
[LOGITS Ex2 A] Mean Abs: 1.583 | Max: 5.645 [LOSS Ex2] A: 0.21779 | B: 0.39808 | C: 0.34802 ** [JOINT LOSS] ** : 0.991393 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007686 | Grad Max: 0.235881 -> Layer: shared_layers.0.bias | Grad Mean: 0.333056 | Grad Max: 1.379103 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001870 | Grad Max: 0.007243 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004975 | Grad Max: 0.004975 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002355 | Grad Max: 0.197010 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043053 | Grad Max: 0.992093 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.010765 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019676 | Grad Max: 0.070452 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000742 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004400 | Grad Max: 0.009282 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000367 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001340 | Grad Max: 0.003304 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002252 | Grad Max: 0.004526 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037142 | Grad Max: 0.037142 [GRADIENT NORM TOTAL] 6.2214 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.231 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5843062 0.41569382] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.029 [MASKS] A(Pass/Fail): 521/1527 | B: 363/1685 | C: 268/1780 [LOSS Ex1] A: 0.67043 | B: 0.66952 | C: 0.66469 [LOGITS Ex2 A] Mean Abs: 1.567 | Max: 5.932 [LOSS Ex2] A: 0.22198 | B: 0.43160 | C: 0.33056 ** [JOINT LOSS] ** : 0.996258 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008717 | Grad Max: 0.227425 -> Layer: shared_layers.0.bias | Grad Mean: 0.440311 | Grad Max: 1.929415 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.007960 -> Layer: 
exit1_layers.0.bias | Grad Mean: 0.005873 | Grad Max: 0.005873 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003094 | Grad Max: 0.250056 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057741 | Grad Max: 1.372991 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000633 | Grad Max: 0.015469 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027010 | Grad Max: 0.100097 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001030 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005945 | Grad Max: 0.012534 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000525 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001799 | Grad Max: 0.004515 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002943 | Grad Max: 0.005314 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048776 | Grad Max: 0.048776 [GRADIENT NORM TOTAL] 8.4282 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.306 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5523942 0.44760573] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 430/1186 | B: 363/1685 | C: 243/1805 [LOSS Ex1] A: 0.66937 | B: 0.67006 | C: 0.66677 [LOGITS Ex2 A] Mean Abs: 1.615 | Max: 6.129 [LOSS Ex2] A: 0.20906 | B: 0.41120 | C: 0.31557 ** [JOINT LOSS] ** : 0.980680 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004661 | Grad Max: 0.132481 -> Layer: shared_layers.0.bias | Grad Mean: 0.232357 | Grad Max: 1.119301 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.008278 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003309 | Grad Max: 0.003309 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001631 | Grad Max: 0.128200 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029854 | Grad Max: 0.691799 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.010064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014467 | Grad Max: 0.059901 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000596 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003217 | Grad Max: 0.007114 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000274 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000976 | Grad Max: 0.002440 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001704 | Grad Max: 0.003343 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027219 | Grad Max: 0.027219 [GRADIENT NORM TOTAL] 4.4225 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.443 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50616175 0.49383825] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 561/1487 | B: 381/1667 | C: 292/1756 [LOSS Ex1] A: 0.66956 | B: 0.66720 | C: 0.66391 [LOGITS Ex2 A] Mean Abs: 1.643 | Max: 7.541 [LOSS Ex2] A: 0.22195 | B: 0.40151 | C: 0.32853 ** [JOINT LOSS] ** : 0.984218 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004078 | Grad Max: 0.118318 -> Layer: shared_layers.0.bias | Grad Mean: 0.350693 | Grad Max: 1.520706 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.008064 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004953 | Grad Max: 0.004953 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.214989 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043049 | Grad Max: 1.209262 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.013453 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020135 | Grad Max: 0.083762 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000768 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004306 | Grad Max: 0.009069 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000338 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001299 | Grad Max: 0.003262 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002131 | Grad Max: 0.004237 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.035944 | Grad Max: 0.035944 [GRADIENT NORM TOTAL] 6.9845 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.409 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052999 0.49470013] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 548/1500 | B: 351/1505 | C: 160/1216 [LOSS Ex1] A: 0.66838 | B: 0.67008 | C: 0.66642 [LOGITS Ex2 A] Mean Abs: 1.645 | Max: 5.342 [LOSS Ex2] A: 0.22865 | B: 0.40543 | C: 0.32851 ** [JOINT LOSS] ** : 0.989156 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008463 | Grad Max: 0.219643 -> Layer: shared_layers.0.bias | Grad Mean: 0.480277 | Grad Max: 2.079160 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.008587 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009592 | Grad Max: 0.009592 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003212 | Grad Max: 0.321003 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060828 | Grad Max: 1.702963 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000656 | Grad Max: 0.016325 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028263 | Grad Max: 0.109033 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001144 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006157 | Grad Max: 0.013282 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000495 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001859 | Grad Max: 0.004598 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003005 | Grad Max: 0.005546 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050432 | Grad Max: 0.050432 [GRADIENT NORM TOTAL] 9.3451 [EPOCH SUMMARY] Train Loss: 0.9855 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9659 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 59/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.420 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50190616 0.4980938 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 535/1513 | B: 363/1685 | C: 269/1779 [LOSS Ex1] A: 0.66747 | B: 0.66938 | C: 0.66409 [LOGITS Ex2 A] Mean Abs: 1.639 | Max: 5.914 [LOSS Ex2] A: 0.23537 | B: 0.41542 | C: 0.30711 ** [JOINT LOSS] ** : 0.986281 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008306 | Grad Max: 0.248772 -> Layer: shared_layers.0.bias | Grad Mean: 0.365890 | Grad Max: 1.569164 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.008916 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004936 | Grad Max: 0.004936 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002577 | Grad Max: 0.237593 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048303 | Grad Max: 1.245499 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000526 | Grad Max: 0.014218 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022499 | Grad Max: 0.086304 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000935 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004980 | Grad Max: 0.010682 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000384 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001508 | Grad Max: 0.003671 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002399 | Grad Max: 0.004367 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040705 | Grad Max: 0.040705 [GRADIENT NORM TOTAL] 7.0861 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.355 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049357 0.49506432] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 509/1539 | B: 363/1685 | C: 261/1787 [LOSS Ex1] A: 0.67184 | B: 0.66993 | C: 0.66580 [LOGITS Ex2 A] Mean Abs: 1.572 | Max: 5.329 [LOSS Ex2] A: 0.20829 | B: 0.40754 | C: 0.33243 ** [JOINT LOSS] ** : 0.985278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001981 | Grad Max: 0.050488 -> Layer: shared_layers.0.bias | Grad Mean: 0.085522 | Grad Max: 0.456120 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.006730 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002700 | Grad Max: 0.002700 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000673 | Grad Max: 0.075989 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012387 | Grad Max: 0.429044 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.004557 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005895 | Grad Max: 0.031890 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001278 | Grad Max: 0.003270 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000134 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000382 | Grad Max: 0.001127 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000701 | Grad Max: 0.001921 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010686 | Grad Max: 0.010686 [GRADIENT NORM TOTAL] 1.8069 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.061 | Max: 0.243 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52264535 0.47735465] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 510/1538 | B: 382/1666 | C: 256/1792 [LOSS Ex1] A: 0.67136 | B: 0.66706 | C: 0.66618 [LOGITS Ex2 A] Mean Abs: 1.541 | Max: 5.820 [LOSS Ex2] A: 0.21199 | B: 0.39016 | C: 0.33606 ** [JOINT LOSS] ** : 0.980937 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002405 | Grad Max: 0.073472 
-> Layer: shared_layers.0.bias | Grad Mean: 0.187931 | Grad Max: 0.914569 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.007719 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003084 | Grad Max: 0.003084 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001217 | Grad Max: 0.111801 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022120 | Grad Max: 0.625836 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.008410 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010627 | Grad Max: 0.051773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000424 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002250 | Grad Max: 0.005295 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000179 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000672 | Grad Max: 0.001748 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001038 | Grad Max: 0.002319 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017956 | Grad Max: 0.017956 [GRADIENT NORM TOTAL] 3.6736 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.400 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5952577 0.40474236] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 573/1475 | B: 352/1504 | C: 281/1767 [LOSS Ex1] A: 0.66870 | B: 0.66994 | C: 0.66438 [LOGITS Ex2 A] Mean Abs: 1.602 | Max: 5.578 [LOSS Ex2] A: 0.19835 | B: 0.38906 | C: 0.32299 ** [JOINT LOSS] ** : 0.971143 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002907 | Grad Max: 0.076800 -> Layer: shared_layers.0.bias | Grad Mean: 0.050677 | Grad Max: 0.255436 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.009016 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015323 | Grad Max: 0.015323 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000516 | Grad Max: 0.124013 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008983 | Grad Max: 0.701592 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.003112 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003188 | Grad Max: 0.019424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000252 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000717 | Grad Max: 0.002782 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000223 | Grad Max: 0.000848 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000463 | Grad Max: 0.001853 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006674 | Grad Max: 0.006674 [GRADIENT NORM TOTAL] 1.5243 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.446 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50051963 0.49948034] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 534/1514 | B: 365/1683 | C: 257/1791 [LOSS Ex1] A: 0.67216 | B: 0.66924 | C: 0.66660 [LOGITS Ex2 A] Mean Abs: 1.604 | Max: 6.002 [LOSS Ex2] A: 0.20672 | B: 0.40808 | C: 0.32995 ** [JOINT LOSS] ** : 0.984250 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.074480 -> Layer: shared_layers.0.bias | Grad Mean: 0.034159 | Grad Max: 0.150082 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.007651 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007910 | Grad Max: 0.007910 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000390 | Grad Max: 0.092383 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006026 | Grad Max: 0.516520 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002230 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001503 | Grad Max: 0.012848 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000272 | Grad Max: 0.001832 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000436 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000366 | Grad Max: 0.000970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001857 | Grad Max: 0.001857 [GRADIENT NORM TOTAL] 1.0655 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.234 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.585241 0.41475895] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 525/1523 | B: 365/1683 | C: 253/1795 [LOSS Ex1] A: 0.67020 | B: 0.66976 | C: 0.66679 [LOGITS Ex2 A] Mean Abs: 1.580 | Max: 5.568 [LOSS Ex2] A: 0.21903 | B: 0.41480 | C: 0.32897 ** [JOINT LOSS] ** : 0.989848 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003232 | Grad Max: 0.086449 -> Layer: shared_layers.0.bias | Grad Mean: 0.091592 | Grad Max: 0.409298 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.007413 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001553 | Grad Max: 0.001553 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000664 | Grad Max: 0.106401 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012166 | Grad Max: 0.560784 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004407 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005329 | Grad Max: 0.028301 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000292 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001207 | Grad Max: 0.003171 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000157 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000375 | Grad Max: 0.001137 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000766 | Grad Max: 0.001867 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011202 | Grad Max: 0.011202 [GRADIENT NORM TOTAL] 1.8812 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.064 | Max: 0.310 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55303514 0.44696486] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 437/1179 | B: 384/1664 | C: 272/1776 [LOSS Ex1] A: 0.66913 | B: 0.66686 | C: 0.66351 [LOGITS Ex2 A] Mean Abs: 1.647 | Max: 5.619 [LOSS Ex2] A: 0.20867 | B: 0.38480 | C: 0.31912 ** [JOINT LOSS] ** : 0.970699 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001753 | Grad Max: 0.039900 -> Layer: shared_layers.0.bias | Grad Mean: 0.117040 | Grad Max: 0.470564 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.007548 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002883 | Grad Max: 0.002883 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000812 | Grad Max: 0.104062 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014935 | Grad Max: 0.587553 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.006287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006162 | Grad Max: 0.031284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000342 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001347 | Grad Max: 0.003442 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000408 | Grad Max: 0.001157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000518 | Grad Max: 0.001881 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010381 | Grad Max: 0.010381 [GRADIENT NORM TOTAL] 2.4788 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.448 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50608456 0.4939154 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 563/1485 | B: 352/1504 | C: 272/1776 [LOSS Ex1] A: 0.66930 | B: 0.66973 | C: 0.66601 [LOGITS Ex2 A] Mean Abs: 1.635 | Max: 5.615 [LOSS Ex2] A: 0.21004 | B: 0.38352 | C: 0.31689 ** [JOINT LOSS] ** : 
0.971829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002779 | Grad Max: 0.071800 -> Layer: shared_layers.0.bias | Grad Mean: 0.097732 | Grad Max: 0.350975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.008651 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011567 | Grad Max: 0.011567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000733 | Grad Max: 0.066162 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013157 | Grad Max: 0.371125 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.004642 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005736 | Grad Max: 0.027772 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000374 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001273 | Grad Max: 0.003523 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000389 | Grad Max: 0.001303 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000597 | Grad Max: 0.002140 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010821 | Grad Max: 0.010821 [GRADIENT NORM TOTAL] 1.9413 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.411 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505437 0.494563] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 552/1496 | B: 365/1683 | C: 263/1785 [LOSS Ex1] A: 0.66808 | B: 0.66901 | C: 0.66487 [LOGITS Ex2 A] Mean Abs: 1.598 | Max: 6.199 [LOSS Ex2] A: 0.21230 | B: 0.42833 | C: 0.32712 ** [JOINT LOSS] ** : 0.989904 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.106718 -> Layer: shared_layers.0.bias | Grad Mean: 0.285316 | Grad Max: 1.381896 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.008767 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009330 | Grad Max: 0.009330 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001971 | Grad 
Max: 0.274313 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036700 | Grad Max: 1.551086 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.010791 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016006 | Grad Max: 0.071640 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000611 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003516 | Grad Max: 0.007412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000285 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001058 | Grad Max: 0.002865 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001686 | Grad Max: 0.003161 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028322 | Grad Max: 0.028322 [GRADIENT NORM TOTAL] 6.0420 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.423 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50180054 0.4981994 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 540/1508 | B: 367/1681 | C: 276/1772 [LOSS Ex1] A: 0.66716 | B: 0.66953 | C: 0.66375 [LOGITS Ex2 A] Mean Abs: 1.588 | Max: 6.804 [LOSS Ex2] A: 0.22209 | B: 0.42532 | C: 0.32662 ** [JOINT LOSS] ** : 0.991490 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004710 | Grad Max: 0.127031 -> Layer: shared_layers.0.bias | Grad Mean: 0.352627 | Grad Max: 1.621086 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.008367 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001851 | Grad Max: 0.001851 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002370 | Grad Max: 0.313346 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044691 | Grad Max: 1.751634 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.014385 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020851 | Grad Max: 0.089648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000805 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004449 | Grad Max: 0.009155 
-> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000321 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001318 | Grad Max: 0.003053 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001998 | Grad Max: 0.003643 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034766 | Grad Max: 0.034766 [GRADIENT NORM TOTAL] 7.1427 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.359 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047899 0.49521014] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.525 | Std: 0.030 [MASKS] A(Pass/Fail): 513/1535 | B: 385/1663 | C: 266/1782 [LOSS Ex1] A: 0.67158 | B: 0.66662 | C: 0.66670 [LOGITS Ex2 A] Mean Abs: 1.576 | Max: 5.798 [LOSS Ex2] A: 0.20169 | B: 0.39306 | C: 0.34866 ** [JOINT LOSS] ** : 0.982770 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.041885 -> Layer: shared_layers.0.bias | Grad Mean: 0.088274 | Grad Max: 0.423648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001903 | Grad Max: 0.006351 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002944 | Grad Max: 0.002944 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000694 | Grad Max: 0.046195 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012540 | Grad Max: 0.256276 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000137 | Grad Max: 0.004936 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005715 | Grad Max: 0.028117 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000276 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001188 | Grad Max: 0.003397 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000357 | Grad Max: 0.001138 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000687 | Grad Max: 0.001858 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010452 | Grad Max: 0.010452 [GRADIENT NORM TOTAL] 1.7715 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.246 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5229959 0.47700408] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 520/1528 | B: 355/1501 | C: 269/1779 [LOSS Ex1] A: 0.67109 | B: 0.66951 | C: 0.66538 [LOGITS Ex2 A] Mean Abs: 1.608 | Max: 5.532 [LOSS Ex2] A: 0.22133 | B: 0.39239 | C: 0.34176 ** [JOINT LOSS] ** : 0.987152 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008226 | Grad Max: 0.194417 -> Layer: shared_layers.0.bias | Grad Mean: 0.499788 | Grad Max: 2.208280 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001878 | Grad Max: 0.006924 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003018 | Grad Max: 0.003018 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003342 | Grad Max: 0.277116 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062650 | Grad Max: 1.465072 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000674 | Grad Max: 0.019225 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029332 | Grad Max: 0.128707 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001095 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006425 | Grad Max: 0.013907 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000455 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001937 | Grad Max: 0.004543 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003115 | Grad Max: 0.005727 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052521 | Grad Max: 0.052521 [GRADIENT NORM TOTAL] 9.6824 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.404 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59694856 0.40305147] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 584/1464 | B: 367/1681 | C: 263/1785 [LOSS Ex1] A: 0.66838 | B: 0.66880 | C: 0.66593 [LOGITS Ex2 A] Mean Abs: 
1.654 | Max: 5.345 [LOSS Ex2] A: 0.22611 | B: 0.45722 | C: 0.33459 ** [JOINT LOSS] ** : 1.007012 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010960 | Grad Max: 0.258725 -> Layer: shared_layers.0.bias | Grad Mean: 0.743837 | Grad Max: 3.247124 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.008782 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013555 | Grad Max: 0.013555 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004957 | Grad Max: 0.408639 -> Layer: exit2_layers.0.bias | Grad Mean: 0.093133 | Grad Max: 2.190953 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001013 | Grad Max: 0.028304 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044143 | Grad Max: 0.193477 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000140 | Grad Max: 0.001728 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009633 | Grad Max: 0.020916 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000673 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002893 | Grad Max: 0.006921 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004626 | Grad Max: 0.008165 -> Layer: exit2_layers.12.bias | Grad Mean: 0.078353 | Grad Max: 0.078353 [GRADIENT NORM TOTAL] 14.3904 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.451 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003653 0.49963468] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 543/1505 | B: 369/1679 | C: 193/1183 [LOSS Ex1] A: 0.67188 | B: 0.66934 | C: 0.66198 [LOGITS Ex2 A] Mean Abs: 1.656 | Max: 6.279 [LOSS Ex2] A: 0.20175 | B: 0.43172 | C: 0.33425 ** [JOINT LOSS] ** : 0.990303 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006146 | Grad Max: 0.180212 -> Layer: shared_layers.0.bias | Grad Mean: 0.527581 | Grad Max: 2.361727 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001982 | Grad Max: 0.006453 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.004165 | Grad Max: 0.004165 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003384 | Grad Max: 0.320426 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063873 | Grad Max: 1.821201 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.020652 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030227 | Grad Max: 0.128045 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001169 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006550 | Grad Max: 0.013938 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000459 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001956 | Grad Max: 0.004711 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002907 | Grad Max: 0.005433 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051238 | Grad Max: 0.051238 [GRADIENT NORM TOTAL] 10.3666 [EPOCH SUMMARY] Train Loss: 0.9849 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9570 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9620 -> New: 0.9570) ############################## EPOCH 60/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.238 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5865094 0.41349065] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 538/1510 | B: 389/1659 | C: 258/1790 [LOSS Ex1] A: 0.66990 | B: 0.66643 | C: 0.66543 [LOGITS Ex2 A] Mean Abs: 1.605 | Max: 5.482 [LOSS Ex2] A: 0.22016 | B: 0.38872 | C: 0.31757 ** [JOINT LOSS] ** : 0.976067 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.053151 -> Layer: shared_layers.0.bias | Grad Mean: 0.134163 | Grad Max: 0.637508 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.007869 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003879 | Grad Max: 0.003879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000946 | Grad Max: 
0.108384 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017019 | Grad Max: 0.605516 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000164 | Grad Max: 0.005506 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007140 | Grad Max: 0.034309 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000305 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001528 | Grad Max: 0.003849 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000134 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000466 | Grad Max: 0.001183 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000718 | Grad Max: 0.002206 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013481 | Grad Max: 0.013481 [GRADIENT NORM TOTAL] 2.7932 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.315 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5538697 0.4461303] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 454/1162 | B: 359/1497 | C: 270/1778 [LOSS Ex1] A: 0.66882 | B: 0.66933 | C: 0.66527 [LOGITS Ex2 A] Mean Abs: 1.595 | Max: 5.643 [LOSS Ex2] A: 0.21534 | B: 0.40566 | C: 0.33941 ** [JOINT LOSS] ** : 0.987943 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008758 | Grad Max: 0.211108 -> Layer: shared_layers.0.bias | Grad Mean: 0.594218 | Grad Max: 2.431098 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001967 | Grad Max: 0.007737 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003821 | Grad Max: 0.391829 -> Layer: exit2_layers.0.bias | Grad Mean: 0.071978 | Grad Max: 2.161278 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000784 | Grad Max: 0.020520 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034116 | Grad Max: 0.132039 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000108 | Grad Max: 0.001327 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007432 | Grad Max: 0.016487 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000535 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002225 | Grad Max: 0.005170 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003511 | Grad Max: 0.006379 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060207 | Grad Max: 0.060207 [GRADIENT NORM TOTAL] 11.3327 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.452 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50600886 0.49399117] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 578/1470 | B: 370/1678 | C: 280/1768 [LOSS Ex1] A: 0.66900 | B: 0.66863 | C: 0.66265 [LOGITS Ex2 A] Mean Abs: 1.582 | Max: 7.112 [LOSS Ex2] A: 0.23550 | B: 0.47492 | C: 0.37092 ** [JOINT LOSS] ** : 1.027207 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014241 | Grad Max: 0.323871 -> Layer: shared_layers.0.bias | Grad Mean: 0.851079 | Grad Max: 3.555363 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008217 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005260 | Grad Max: 0.005260 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005572 | Grad Max: 0.528082 -> Layer: exit2_layers.0.bias | Grad Mean: 0.105000 | Grad Max: 2.904001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001132 | Grad Max: 0.027226 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049238 | Grad Max: 0.184093 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000157 | Grad Max: 0.001818 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010772 | Grad Max: 0.022694 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000069 | Grad Max: 0.000785 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003250 | Grad Max: 0.007317 -> Layer: exit2_layers.12.weight | Grad Mean: 0.005095 | Grad Max: 0.010904 -> Layer: exit2_layers.12.bias | Grad Mean: 0.088663 | Grad Max: 0.088663 [GRADIENT NORM TOTAL] 16.2341 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.413 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055204 0.49447954] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 560/1488 | B: 370/1678 | C: 269/1779 [LOSS Ex1] A: 0.66778 | B: 0.66918 | C: 0.66570 [LOGITS Ex2 A] Mean Abs: 1.564 | Max: 5.905 [LOSS Ex2] A: 0.21821 | B: 0.45370 | C: 0.36443 ** [JOINT LOSS] ** : 1.012996 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011302 | Grad Max: 0.253828 -> Layer: shared_layers.0.bias | Grad Mean: 0.775810 | Grad Max: 3.210457 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.008239 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006923 | Grad Max: 0.006923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004942 | Grad Max: 0.486739 -> Layer: exit2_layers.0.bias | Grad Mean: 0.093722 | Grad Max: 2.667166 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001013 | Grad Max: 0.028469 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044303 | Grad Max: 0.185407 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000140 | Grad Max: 0.001610 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009672 | Grad Max: 0.020095 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000061 | Grad Max: 0.000671 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002912 | Grad Max: 0.006660 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004499 | Grad Max: 0.008952 -> Layer: exit2_layers.12.bias | Grad Mean: 0.078906 | Grad Max: 0.078906 [GRADIENT NORM TOTAL] 14.8139 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.425 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50172675 0.49827328] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.030 [MASKS] A(Pass/Fail): 546/1502 | B: 390/1658 | C: 278/1770 [LOSS Ex1] A: 0.66687 | B: 0.66626 | C: 0.66422 [LOGITS Ex2 A] Mean Abs: 
1.581 | Max: 6.668 [LOSS Ex2] A: 0.23090 | B: 0.41120 | C: 0.33313 ** [JOINT LOSS] ** : 0.990860 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005342 | Grad Max: 0.120591 -> Layer: shared_layers.0.bias | Grad Mean: 0.387335 | Grad Max: 1.611042 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.008668 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007160 | Grad Max: 0.007160 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002417 | Grad Max: 0.284920 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045666 | Grad Max: 1.596665 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.013835 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021626 | Grad Max: 0.095068 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000743 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004687 | Grad Max: 0.009810 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000316 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001404 | Grad Max: 0.003188 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002189 | Grad Max: 0.004163 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038192 | Grad Max: 0.038192 [GRADIENT NORM TOTAL] 7.4954 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.360 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5047223 0.4952777] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 521/1527 | B: 359/1497 | C: 275/1773 [LOSS Ex1] A: 0.67134 | B: 0.66918 | C: 0.66329 [LOGITS Ex2 A] Mean Abs: 1.606 | Max: 6.146 [LOSS Ex2] A: 0.21673 | B: 0.37825 | C: 0.31826 ** [JOINT LOSS] ** : 0.972349 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004846 | Grad Max: 0.123933 -> Layer: shared_layers.0.bias | Grad Mean: 0.232595 | Grad Max: 0.993687 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.007458 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.004075 | Grad Max: 0.004075 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001588 | Grad Max: 0.168211 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030045 | Grad Max: 0.872708 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.010894 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014534 | Grad Max: 0.067395 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000585 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003169 | Grad Max: 0.007046 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000959 | Grad Max: 0.002209 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001542 | Grad Max: 0.003585 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026691 | Grad Max: 0.026691 [GRADIENT NORM TOTAL] 4.5482 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.062 | Max: 0.248 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5232282 0.4767718] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.525 | Std: 0.029 [MASKS] A(Pass/Fail): 527/1521 | B: 373/1675 | C: 249/1799 [LOSS Ex1] A: 0.67087 | B: 0.66849 | C: 0.66591 [LOGITS Ex2 A] Mean Abs: 1.615 | Max: 6.501 [LOSS Ex2] A: 0.22981 | B: 0.43196 | C: 0.32766 ** [JOINT LOSS] ** : 0.998231 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008789 | Grad Max: 0.213821 -> Layer: shared_layers.0.bias | Grad Mean: 0.500862 | Grad Max: 2.172618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001941 | Grad Max: 0.007395 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002658 | Grad Max: 0.002658 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003311 | Grad Max: 0.290745 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062490 | Grad Max: 1.485112 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000664 | Grad Max: 0.017013 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028982 | Grad Max: 0.117468 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad 
Max: 0.001102 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006348 | Grad Max: 0.013703 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000472 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001902 | Grad Max: 0.004635 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002958 | Grad Max: 0.005133 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050610 | Grad Max: 0.050610 [GRADIENT NORM TOTAL] 9.5075 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.407 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59814197 0.40185803] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.031 [MASKS] A(Pass/Fail): 589/1459 | B: 370/1678 | C: 264/1784 [LOSS Ex1] A: 0.66813 | B: 0.66904 | C: 0.66537 [LOGITS Ex2 A] Mean Abs: 1.639 | Max: 6.037 [LOSS Ex2] A: 0.21414 | B: 0.41335 | C: 0.34664 ** [JOINT LOSS] ** : 0.992224 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006722 | Grad Max: 0.178380 -> Layer: shared_layers.0.bias | Grad Mean: 0.356520 | Grad Max: 1.542742 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.007594 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000943 | Grad Max: 0.000943 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002455 | Grad Max: 0.214602 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046073 | Grad Max: 1.146435 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.013665 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021337 | Grad Max: 0.087586 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004699 | Grad Max: 0.010187 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000350 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001411 | Grad Max: 0.003347 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002116 | Grad Max: 0.003998 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036655 | Grad Max: 0.036655 
[GRADIENT NORM TOTAL] 6.9180 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.454 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50032395 0.49967608] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 547/1501 | B: 392/1656 | C: 256/1792 [LOSS Ex1] A: 0.67167 | B: 0.66612 | C: 0.66788 [LOGITS Ex2 A] Mean Abs: 1.606 | Max: 5.532 [LOSS Ex2] A: 0.19655 | B: 0.38827 | C: 0.31583 ** [JOINT LOSS] ** : 0.968771 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.064160 -> Layer: shared_layers.0.bias | Grad Mean: 0.053162 | Grad Max: 0.259987 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.007269 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004116 | Grad Max: 0.004116 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000504 | Grad Max: 0.065639 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008512 | Grad Max: 0.358803 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.004105 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002850 | Grad Max: 0.015876 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000229 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000657 | Grad Max: 0.003065 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000192 | Grad Max: 0.000747 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000582 | Grad Max: 0.001724 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004392 | Grad Max: 0.004392 [GRADIENT NORM TOTAL] 1.3006 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.241 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.587352 0.41264802] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.031 [MASKS] A(Pass/Fail): 542/1506 | B: 360/1496 | C: 289/1759 
[LOSS Ex1] A: 0.66968 | B: 0.66904 | C: 0.66191 [LOGITS Ex2 A] Mean Abs: 1.588 | Max: 5.810 [LOSS Ex2] A: 0.21474 | B: 0.38569 | C: 0.33787 ** [JOINT LOSS] ** : 0.979643 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002421 | Grad Max: 0.054899 -> Layer: shared_layers.0.bias | Grad Mean: 0.176122 | Grad Max: 0.769383 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.007947 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004843 | Grad Max: 0.004843 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001130 | Grad Max: 0.116294 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020902 | Grad Max: 0.645543 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000221 | Grad Max: 0.006299 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009664 | Grad Max: 0.040351 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000400 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002111 | Grad Max: 0.005332 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000178 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000626 | Grad Max: 0.001644 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000905 | Grad Max: 0.002067 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015400 | Grad Max: 0.015400 [GRADIENT NORM TOTAL] 3.3746 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.319 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5543638 0.4456362] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 455/1161 | B: 373/1675 | C: 271/1777 [LOSS Ex1] A: 0.66861 | B: 0.66835 | C: 0.66246 [LOGITS Ex2 A] Mean Abs: 1.644 | Max: 5.310 [LOSS Ex2] A: 0.19792 | B: 0.41175 | C: 0.31257 ** [JOINT LOSS] ** : 0.973890 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001880 | Grad Max: 0.045277 -> Layer: shared_layers.0.bias | Grad Mean: 0.037093 | Grad Max: 0.155444 -> Layer: exit1_layers.0.weight | Grad Mean: 
0.002074 | Grad Max: 0.007906 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000681 | Grad Max: 0.000681 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000360 | Grad Max: 0.097521 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006009 | Grad Max: 0.532692 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002434 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001250 | Grad Max: 0.012139 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000175 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000206 | Grad Max: 0.001347 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000061 | Grad Max: 0.000398 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000337 | Grad Max: 0.001022 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000475 | Grad Max: 0.000475 [GRADIENT NORM TOTAL] 1.1833 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.455 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059968 0.49400315] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 580/1468 | B: 372/1676 | C: 254/1794 [LOSS Ex1] A: 0.66878 | B: 0.66888 | C: 0.66530 [LOGITS Ex2 A] Mean Abs: 1.638 | Max: 7.406 [LOSS Ex2] A: 0.20388 | B: 0.41100 | C: 0.31929 ** [JOINT LOSS] ** : 0.979043 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002934 | Grad Max: 0.075179 -> Layer: shared_layers.0.bias | Grad Mean: 0.147261 | Grad Max: 0.664273 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.007454 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003332 | Grad Max: 0.003332 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001055 | Grad Max: 0.105473 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019213 | Grad Max: 0.586025 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000196 | Grad Max: 0.006449 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008555 | Grad 
Max: 0.039479 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000392 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001907 | Grad Max: 0.004942 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000161 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001655 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000921 | Grad Max: 0.002889 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016173 | Grad Max: 0.016173 [GRADIENT NORM TOTAL] 2.9799 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.415 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055698 0.49443012] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 564/1484 | B: 396/1652 | C: 261/1787 [LOSS Ex1] A: 0.66754 | B: 0.66593 | C: 0.66457 [LOGITS Ex2 A] Mean Abs: 1.607 | Max: 5.472 [LOSS Ex2] A: 0.20255 | B: 0.39556 | C: 0.30912 ** [JOINT LOSS] ** : 0.968422 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.043004 -> Layer: shared_layers.0.bias | Grad Mean: 0.041761 | Grad Max: 0.331004 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.008911 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012408 | Grad Max: 0.012408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000338 | Grad Max: 0.169745 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005659 | Grad Max: 0.954903 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002796 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001186 | Grad Max: 0.010472 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000222 | Grad Max: 0.002018 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000081 | Grad Max: 0.000428 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 
0.001298 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002226 | Grad Max: 0.002226 [GRADIENT NORM TOTAL] 1.4045 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017212 0.49827883] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 550/1498 | B: 366/1490 | C: 182/1194 [LOSS Ex1] A: 0.66662 | B: 0.66884 | C: 0.66653 [LOGITS Ex2 A] Mean Abs: 1.597 | Max: 5.854 [LOSS Ex2] A: 0.21991 | B: 0.39792 | C: 0.32311 ** [JOINT LOSS] ** : 0.980979 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003332 | Grad Max: 0.064914 -> Layer: shared_layers.0.bias | Grad Mean: 0.212949 | Grad Max: 0.827534 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.008647 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009351 | Grad Max: 0.009351 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001315 | Grad Max: 0.169734 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024678 | Grad Max: 0.954115 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.007687 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011237 | Grad Max: 0.047972 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000456 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002404 | Grad Max: 0.005412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000706 | Grad Max: 0.001834 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001089 | Grad Max: 0.002303 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018264 | Grad Max: 0.018264 [GRADIENT NORM TOTAL] 3.9248 [EPOCH SUMMARY] Train Loss: 0.9863 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9541 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9570 -> New: 0.9541) ############################## EPOCH 61/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.362 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50470597 0.49529406] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 522/1526 | B: 380/1668 | C: 274/1774 [LOSS Ex1] A: 0.67111 | B: 0.66813 | C: 0.66381 [LOGITS Ex2 A] Mean Abs: 1.591 | Max: 5.748 [LOSS Ex2] A: 0.20720 | B: 0.41035 | C: 0.34098 ** [JOINT LOSS] ** : 0.987195 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003410 | Grad Max: 0.103932 -> Layer: shared_layers.0.bias | Grad Mean: 0.084690 | Grad Max: 0.322226 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.006844 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002698 | Grad Max: 0.002698 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000700 | Grad Max: 0.115629 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012365 | Grad Max: 0.631008 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.004077 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005736 | Grad Max: 0.023896 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000323 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001320 | Grad Max: 0.003633 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000409 | Grad Max: 0.001215 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000774 | Grad Max: 0.001873 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011919 | Grad Max: 0.011919 [GRADIENT NORM TOTAL] 1.7879 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.251 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5234204 0.4765796] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 529/1519 | B: 380/1668 | C: 289/1759 [LOSS Ex1] A: 0.67063 | B: 0.66866 | C: 0.66351 [LOGITS Ex2 A] Mean Abs: 1.595 | Max: 5.997 [LOSS Ex2] A: 0.20792 | B: 0.42305 | C: 0.32380 ** [JOINT LOSS] ** : 0.985856 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006030 | Grad Max: 0.143572 -> Layer: shared_layers.0.bias | Grad Mean: 0.426551 | Grad Max: 1.998412 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001916 | Grad Max: 0.007274 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000620 | Grad Max: 0.000620 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002802 | Grad Max: 0.258407 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052280 | Grad Max: 1.436124 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.015260 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024260 | Grad Max: 0.104074 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000890 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005337 | Grad Max: 0.011090 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000389 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001597 | Grad Max: 0.003924 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002345 | Grad Max: 0.004931 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041468 | Grad Max: 0.041468 [GRADIENT NORM TOTAL] 8.3590 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.412 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59945875 0.40054125] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.031 [MASKS] A(Pass/Fail): 594/1454 | B: 407/1641 | C: 257/1791 [LOSS Ex1] A: 0.66785 | B: 0.66569 | C: 0.66558 [LOGITS Ex2 A] Mean Abs: 1.651 | Max: 5.911 [LOSS Ex2] A: 0.21277 | B: 0.41561 | C: 0.36073 ** [JOINT LOSS] ** : 0.996078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008454 | Grad Max: 0.200331 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.584588 | Grad Max: 2.582797 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.008589 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011922 | Grad Max: 0.011922 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003906 | Grad Max: 0.356968 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073203 | Grad Max: 1.996109 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000785 | Grad Max: 0.023323 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034471 | Grad Max: 0.154390 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007537 | Grad Max: 0.015832 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000505 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002258 | Grad Max: 0.005228 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003527 | Grad Max: 0.006425 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060669 | Grad Max: 0.060669 [GRADIENT NORM TOTAL] 11.4713 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.459 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50030273 0.49969724] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 553/1495 | B: 370/1486 | C: 285/1763 [LOSS Ex1] A: 0.67141 | B: 0.66863 | C: 0.66124 [LOGITS Ex2 A] Mean Abs: 1.646 | Max: 5.442 [LOSS Ex2] A: 0.19524 | B: 0.39629 | C: 0.31459 ** [JOINT LOSS] ** : 0.969136 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004748 | Grad Max: 0.123601 -> Layer: shared_layers.0.bias | Grad Mean: 0.332837 | Grad Max: 1.502105 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007556 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008100 | Grad Max: 0.008100 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.233401 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042324 | Grad Max: 1.315708 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000457 | Grad Max: 0.013564 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020198 | Grad Max: 0.089369 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000741 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004348 | Grad Max: 0.009093 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001305 | Grad Max: 0.003218 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002025 | Grad Max: 0.004415 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035625 | Grad Max: 0.035625 [GRADIENT NORM TOTAL] 6.6992 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.245 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5883854 0.41161466] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 545/1503 | B: 388/1660 | C: 248/1800 [LOSS Ex1] A: 0.66939 | B: 0.66793 | C: 0.66534 [LOGITS Ex2 A] Mean Abs: 1.599 | Max: 6.337 [LOSS Ex2] A: 0.20643 | B: 0.42129 | C: 0.33090 ** [JOINT LOSS] ** : 0.987094 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003372 | Grad Max: 0.075008 -> Layer: shared_layers.0.bias | Grad Mean: 0.243396 | Grad Max: 1.115784 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001977 | Grad Max: 0.007364 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000132 | Grad Max: 0.000132 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001619 | Grad Max: 0.121927 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030350 | Grad Max: 0.674740 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.009789 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014472 | Grad Max: 0.059059 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003150 | Grad Max: 0.007063 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad 
Max: 0.002164 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.002591 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024300 | Grad Max: 0.024300 [GRADIENT NORM TOTAL] 4.7002 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.324 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55503696 0.44496307] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.031 [MASKS] A(Pass/Fail): 455/1161 | B: 386/1662 | C: 256/1792 [LOSS Ex1] A: 0.66831 | B: 0.66847 | C: 0.66587 [LOGITS Ex2 A] Mean Abs: 1.621 | Max: 5.633 [LOSS Ex2] A: 0.20719 | B: 0.41813 | C: 0.31017 ** [JOINT LOSS] ** : 0.979377 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006963 | Grad Max: 0.185997 -> Layer: shared_layers.0.bias | Grad Mean: 0.426402 | Grad Max: 1.934838 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.007556 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002714 | Grad Max: 0.002714 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002784 | Grad Max: 0.239688 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052777 | Grad Max: 1.366287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.015299 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025421 | Grad Max: 0.102793 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000954 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005572 | Grad Max: 0.012334 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000394 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001655 | Grad Max: 0.003871 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002551 | Grad Max: 0.004431 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043796 | Grad Max: 0.043796 [GRADIENT NORM TOTAL] 8.1694 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.461 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.50599927 0.49400067] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 581/1467 | B: 411/1637 | C: 277/1771 [LOSS Ex1] A: 0.66848 | B: 0.66550 | C: 0.66317 [LOGITS Ex2 A] Mean Abs: 1.629 | Max: 6.915 [LOSS Ex2] A: 0.21290 | B: 0.39463 | C: 0.30231 ** [JOINT LOSS] ** : 0.968995 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.066732 -> Layer: shared_layers.0.bias | Grad Mean: 0.163984 | Grad Max: 0.863400 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.007547 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001604 | Grad Max: 0.001604 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001075 | Grad Max: 0.147098 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020027 | Grad Max: 0.840229 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000217 | Grad Max: 0.006943 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009487 | Grad Max: 0.043191 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000404 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002049 | Grad Max: 0.005072 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000188 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000607 | Grad Max: 0.001623 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000945 | Grad Max: 0.002301 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015277 | Grad Max: 0.015277 [GRADIENT NORM TOTAL] 3.3292 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.418 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50557816 0.49442187] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 566/1482 | B: 372/1484 | C: 273/1775 [LOSS Ex1] A: 0.66723 | B: 0.66845 | C: 0.66419 [LOGITS Ex2 A] Mean Abs: 1.656 | Max: 5.785 [LOSS Ex2] A: 0.21166 | B: 0.39260 | C: 0.30824 ** [JOINT LOSS] ** : 0.970793 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.007123 | Grad Max: 0.159653 -> Layer: shared_layers.0.bias | Grad Mean: 0.316597 | Grad Max: 1.518245 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.008494 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010917 | Grad Max: 0.010917 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.218808 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041786 | Grad Max: 1.156738 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000458 | Grad Max: 0.012216 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019927 | Grad Max: 0.080652 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000777 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004397 | Grad Max: 0.009819 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000305 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001328 | Grad Max: 0.003092 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002139 | Grad Max: 0.004450 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036675 | Grad Max: 0.036675 [GRADIENT NORM TOTAL] 6.2014 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.430 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50170904 0.498291 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 556/1492 | B: 393/1655 | C: 263/1785 [LOSS Ex1] A: 0.66632 | B: 0.66776 | C: 0.66313 [LOGITS Ex2 A] Mean Abs: 1.672 | Max: 6.341 [LOSS Ex2] A: 0.22774 | B: 0.41651 | C: 0.34280 ** [JOINT LOSS] ** : 0.994750 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008461 | Grad Max: 0.235956 -> Layer: shared_layers.0.bias | Grad Mean: 0.436706 | Grad Max: 2.080117 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.008593 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005887 | Grad Max: 0.005887 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003016 | Grad Max: 0.298752 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.056378 | Grad Max: 1.628642 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000606 | Grad Max: 0.014412 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026661 | Grad Max: 0.101341 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000996 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005892 | Grad Max: 0.012602 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000428 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001762 | Grad Max: 0.004192 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002722 | Grad Max: 0.004819 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046405 | Grad Max: 0.046405 [GRADIENT NORM TOTAL] 8.5025 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.364 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046973 0.49530262] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.031 [MASKS] A(Pass/Fail): 528/1520 | B: 392/1656 | C: 266/1782 [LOSS Ex1] A: 0.67085 | B: 0.66830 | C: 0.66530 [LOGITS Ex2 A] Mean Abs: 1.624 | Max: 6.068 [LOSS Ex2] A: 0.20739 | B: 0.40190 | C: 0.32156 ** [JOINT LOSS] ** : 0.978434 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003661 | Grad Max: 0.089028 -> Layer: shared_layers.0.bias | Grad Mean: 0.183158 | Grad Max: 0.864889 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001881 | Grad Max: 0.006260 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004116 | Grad Max: 0.004116 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.173727 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023264 | Grad Max: 0.951807 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.007417 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010194 | Grad Max: 0.045133 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000414 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002216 | Grad Max: 0.005353 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000662 | Grad Max: 0.001654 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000911 | Grad Max: 0.002579 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016683 | Grad Max: 0.016683 [GRADIENT NORM TOTAL] 3.8058 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.254 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5235718 0.4764282] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 539/1509 | B: 413/1635 | C: 260/1788 [LOSS Ex1] A: 0.67038 | B: 0.66533 | C: 0.66468 [LOGITS Ex2 A] Mean Abs: 1.541 | Max: 6.278 [LOSS Ex2] A: 0.21199 | B: 0.40255 | C: 0.31076 ** [JOINT LOSS] ** : 0.975227 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004339 | Grad Max: 0.114428 -> Layer: shared_layers.0.bias | Grad Mean: 0.305227 | Grad Max: 1.396050 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.007761 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005191 | Grad Max: 0.005191 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001986 | Grad Max: 0.202137 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037390 | Grad Max: 1.138028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.011993 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018318 | Grad Max: 0.079839 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000702 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003993 | Grad Max: 0.008840 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000308 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001178 | Grad Max: 0.002934 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001732 | Grad Max: 0.003188 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029935 | Grad Max: 0.029935 [GRADIENT NORM TOTAL] 5.8765 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.414 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6006655 0.3993345] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 596/1452 | B: 378/1478 | C: 282/1766 [LOSS Ex1] A: 0.66757 | B: 0.66829 | C: 0.66360 [LOGITS Ex2 A] Mean Abs: 1.583 | Max: 5.829 [LOSS Ex2] A: 0.20567 | B: 0.40055 | C: 0.32285 ** [JOINT LOSS] ** : 0.976180 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005610 | Grad Max: 0.132878 -> Layer: shared_layers.0.bias | Grad Mean: 0.432371 | Grad Max: 1.945987 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.008866 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015609 | Grad Max: 0.015609 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002635 | Grad Max: 0.297135 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050247 | Grad Max: 1.658512 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000546 | Grad Max: 0.014312 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024216 | Grad Max: 0.095575 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000833 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005253 | Grad Max: 0.011180 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000406 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001551 | Grad Max: 0.003848 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002215 | Grad Max: 0.004273 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039348 | Grad Max: 0.039348 [GRADIENT NORM TOTAL] 8.0861 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.463 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003233 0.49967673] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 560/1488 | B: 398/1650 | C: 280/1768 [LOSS Ex1] A: 0.67117 | B: 0.66761 | C: 0.66393 [LOGITS Ex2 A] Mean Abs: 1.589 | Max: 5.582 
[LOSS Ex2] A: 0.19942 | B: 0.41859 | C: 0.31202 ** [JOINT LOSS] ** : 0.977578 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005204 | Grad Max: 0.155215 -> Layer: shared_layers.0.bias | Grad Mean: 0.279352 | Grad Max: 1.231132 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.007383 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004907 | Grad Max: 0.004907 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.183455 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032635 | Grad Max: 0.996078 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000354 | Grad Max: 0.009035 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015535 | Grad Max: 0.065851 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000652 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003416 | Grad Max: 0.007667 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000237 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002378 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001544 | Grad Max: 0.003015 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026198 | Grad Max: 0.026198 [GRADIENT NORM TOTAL] 5.1777 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.248 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58926284 0.4107372 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 552/1496 | B: 395/1653 | C: 180/1196 [LOSS Ex1] A: 0.66914 | B: 0.66816 | C: 0.66629 [LOGITS Ex2 A] Mean Abs: 1.635 | Max: 5.634 [LOSS Ex2] A: 0.21501 | B: 0.41096 | C: 0.33055 ** [JOINT LOSS] ** : 0.986704 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.080792 -> Layer: shared_layers.0.bias | Grad Mean: 0.231696 | Grad Max: 0.988680 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.007572 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000599 | Grad 
Max: 0.000599 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001561 | Grad Max: 0.156320 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029138 | Grad Max: 0.876193 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.009917 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013742 | Grad Max: 0.060381 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000521 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002971 | Grad Max: 0.006770 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002217 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001241 | Grad Max: 0.002815 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022739 | Grad Max: 0.022739 [GRADIENT NORM TOTAL] 4.7420 [EPOCH SUMMARY] Train Loss: 0.9810 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9639 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 62/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.328 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55551356 0.4444864 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.031 [MASKS] A(Pass/Fail): 459/1157 | B: 418/1630 | C: 286/1762 [LOSS Ex1] A: 0.66806 | B: 0.66518 | C: 0.66121 [LOGITS Ex2 A] Mean Abs: 1.682 | Max: 5.555 [LOSS Ex2] A: 0.20190 | B: 0.39534 | C: 0.31858 ** [JOINT LOSS] ** : 0.970089 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004672 | Grad Max: 0.141840 -> Layer: shared_layers.0.bias | Grad Mean: 0.427139 | Grad Max: 1.902636 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.008201 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004249 | Grad Max: 0.004249 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002680 | Grad Max: 0.258729 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.050545 | Grad Max: 1.428795 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000545 | Grad Max: 0.014782 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024303 | Grad Max: 0.099855 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000918 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005289 | Grad Max: 0.011766 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000361 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001580 | Grad Max: 0.003660 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002362 | Grad Max: 0.004921 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042158 | Grad Max: 0.042158 [GRADIENT NORM TOTAL] 8.3005 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.463 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060262 0.49397376] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 587/1461 | B: 384/1472 | C: 272/1776 [LOSS Ex1] A: 0.66823 | B: 0.66815 | C: 0.66441 [LOGITS Ex2 A] Mean Abs: 1.662 | Max: 7.002 [LOSS Ex2] A: 0.20998 | B: 0.38252 | C: 0.31234 ** [JOINT LOSS] ** : 0.968544 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002613 | Grad Max: 0.088022 -> Layer: shared_layers.0.bias | Grad Mean: 0.208232 | Grad Max: 0.996737 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.007718 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005571 | Grad Max: 0.005571 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001341 | Grad Max: 0.128303 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024935 | Grad Max: 0.714305 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.009170 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012618 | Grad Max: 0.055017 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000542 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002726 | Grad Max: 0.006497 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 
| Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000819 | Grad Max: 0.002073 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001192 | Grad Max: 0.002897 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021598 | Grad Max: 0.021598 [GRADIENT NORM TOTAL] 4.0893 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.420 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5055786 0.49442142] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 569/1479 | B: 401/1647 | C: 277/1771 [LOSS Ex1] A: 0.66698 | B: 0.66747 | C: 0.66392 [LOGITS Ex2 A] Mean Abs: 1.622 | Max: 5.443 [LOSS Ex2] A: 0.20016 | B: 0.41234 | C: 0.31261 ** [JOINT LOSS] ** : 0.974495 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003223 | Grad Max: 0.073760 -> Layer: shared_layers.0.bias | Grad Mean: 0.239793 | Grad Max: 1.050572 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.009040 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015104 | Grad Max: 0.015104 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001571 | Grad Max: 0.194600 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029577 | Grad Max: 1.103725 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000292 | Grad Max: 0.008979 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012992 | Grad Max: 0.056972 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000555 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002821 | Grad Max: 0.006565 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000830 | Grad Max: 0.002025 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001208 | Grad Max: 0.002449 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020772 | Grad Max: 0.020772 [GRADIENT NORM TOTAL] 4.8856 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.431 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017111 0.49828896] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 561/1487 | B: 396/1652 | C: 257/1791 [LOSS Ex1] A: 0.66607 | B: 0.66802 | C: 0.66650 [LOGITS Ex2 A] Mean Abs: 1.587 | Max: 6.243 [LOSS Ex2] A: 0.21899 | B: 0.42542 | C: 0.29740 ** [JOINT LOSS] ** : 0.980797 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005431 | Grad Max: 0.135799 -> Layer: shared_layers.0.bias | Grad Mean: 0.411576 | Grad Max: 1.835365 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.008376 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010651 | Grad Max: 0.010651 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002631 | Grad Max: 0.274307 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050083 | Grad Max: 1.497591 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.015730 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023674 | Grad Max: 0.105641 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000806 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005110 | Grad Max: 0.011215 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000413 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001518 | Grad Max: 0.004025 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002345 | Grad Max: 0.004035 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040229 | Grad Max: 0.040229 [GRADIENT NORM TOTAL] 8.0445 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.366 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046788 0.49532127] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.031 [MASKS] A(Pass/Fail): 529/1519 | B: 423/1625 | C: 246/1802 [LOSS Ex1] A: 0.67064 | B: 0.66504 | C: 0.66698 [LOGITS Ex2 A] Mean Abs: 1.588 | Max: 5.931 [LOSS Ex2] A: 0.20606 | B: 0.40972 | C: 0.32926 
** [JOINT LOSS] ** : 0.982566 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004801 | Grad Max: 0.114700 -> Layer: shared_layers.0.bias | Grad Mean: 0.266806 | Grad Max: 1.183149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.007138 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005608 | Grad Max: 0.005608 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001800 | Grad Max: 0.230934 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033563 | Grad Max: 1.258522 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.010695 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015644 | Grad Max: 0.063422 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000637 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003433 | Grad Max: 0.007763 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000273 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001017 | Grad Max: 0.002672 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001520 | Grad Max: 0.002833 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025975 | Grad Max: 0.025975 [GRADIENT NORM TOTAL] 5.3675 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.063 | Max: 0.257 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52372235 0.47627765] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.030 [MASKS] A(Pass/Fail): 540/1508 | B: 384/1472 | C: 285/1763 [LOSS Ex1] A: 0.67018 | B: 0.66801 | C: 0.66219 [LOGITS Ex2 A] Mean Abs: 1.594 | Max: 6.006 [LOSS Ex2] A: 0.21074 | B: 0.38570 | C: 0.35279 ** [JOINT LOSS] ** : 0.983204 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005134 | Grad Max: 0.141973 -> Layer: shared_layers.0.bias | Grad Mean: 0.225472 | Grad Max: 1.005183 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.007210 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003135 | Grad Max: 0.003135 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.001579 | Grad Max: 0.140153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029241 | Grad Max: 0.791694 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.008830 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013394 | Grad Max: 0.053441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000631 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002976 | Grad Max: 0.007075 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000891 | Grad Max: 0.002281 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001352 | Grad Max: 0.002928 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023235 | Grad Max: 0.023235 [GRADIENT NORM TOTAL] 4.4129 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.416 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6021155 0.39788446] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 598/1450 | B: 403/1645 | C: 296/1752 [LOSS Ex1] A: 0.66735 | B: 0.66734 | C: 0.66251 [LOGITS Ex2 A] Mean Abs: 1.647 | Max: 6.281 [LOSS Ex2] A: 0.21212 | B: 0.41280 | C: 0.31965 ** [JOINT LOSS] ** : 0.980588 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006336 | Grad Max: 0.175567 -> Layer: shared_layers.0.bias | Grad Mean: 0.329693 | Grad Max: 1.466299 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.007683 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002187 | Grad Max: 0.002187 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.153442 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041590 | Grad Max: 0.850766 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000435 | Grad Max: 0.010272 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019142 | Grad Max: 0.070866 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000700 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004246 | 
Grad Max: 0.008985 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000327 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003140 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001821 | Grad Max: 0.003672 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032122 | Grad Max: 0.032122 [GRADIENT NORM TOTAL] 6.2304 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.465 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002693 0.49973068] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 564/1484 | B: 398/1650 | C: 243/1805 [LOSS Ex1] A: 0.67097 | B: 0.66789 | C: 0.66578 [LOGITS Ex2 A] Mean Abs: 1.646 | Max: 5.469 [LOSS Ex2] A: 0.18845 | B: 0.40365 | C: 0.33101 ** [JOINT LOSS] ** : 0.975914 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002351 | Grad Max: 0.046485 -> Layer: shared_layers.0.bias | Grad Mean: 0.095490 | Grad Max: 0.418363 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001897 | Grad Max: 0.007164 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003838 | Grad Max: 0.003838 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000732 | Grad Max: 0.089209 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013058 | Grad Max: 0.491540 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.006104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005126 | Grad Max: 0.035551 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000252 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001029 | Grad Max: 0.003357 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000299 | Grad Max: 0.000858 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000421 | Grad Max: 0.001522 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007249 | Grad Max: 0.007249 [GRADIENT NORM TOTAL] 2.0628 >>> [TRAIN] BATCH 8 START <<< [DATA A] 
Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.251 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.590055 0.40994498] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 555/1493 | B: 426/1622 | C: 275/1773 [LOSS Ex1] A: 0.66892 | B: 0.66490 | C: 0.66501 [LOGITS Ex2 A] Mean Abs: 1.588 | Max: 6.054 [LOSS Ex2] A: 0.20951 | B: 0.41086 | C: 0.31700 ** [JOINT LOSS] ** : 0.978735 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004677 | Grad Max: 0.108538 -> Layer: shared_layers.0.bias | Grad Mean: 0.352795 | Grad Max: 1.586926 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.007783 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006131 | Grad Max: 0.006131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.250992 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042974 | Grad Max: 1.422596 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.011867 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020414 | Grad Max: 0.083843 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000771 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004432 | Grad Max: 0.009694 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000349 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001316 | Grad Max: 0.003503 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001950 | Grad Max: 0.003593 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033807 | Grad Max: 0.033807 [GRADIENT NORM TOTAL] 6.8619 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.331 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5559937 0.44400632] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 460/1156 | B: 388/1468 | C: 291/1757 [LOSS Ex1] A: 0.66784 | B: 0.66788 | C: 0.66174 [LOGITS Ex2 A] Mean 
Abs: 1.609 | Max: 5.937 [LOSS Ex2] A: 0.20639 | B: 0.40408 | C: 0.33863 ** [JOINT LOSS] ** : 0.982182 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007572 | Grad Max: 0.161772 -> Layer: shared_layers.0.bias | Grad Mean: 0.482083 | Grad Max: 2.178672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.008099 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000468 | Grad Max: 0.000468 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003202 | Grad Max: 0.316595 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059879 | Grad Max: 1.787220 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000656 | Grad Max: 0.017092 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029180 | Grad Max: 0.115420 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001040 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006434 | Grad Max: 0.013568 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000435 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001945 | Grad Max: 0.004509 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003093 | Grad Max: 0.005733 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052730 | Grad Max: 0.052730 [GRADIENT NORM TOTAL] 9.3192 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.465 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599146 0.49400854] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 589/1459 | B: 404/1644 | C: 260/1788 [LOSS Ex1] A: 0.66801 | B: 0.66720 | C: 0.66437 [LOGITS Ex2 A] Mean Abs: 1.628 | Max: 6.907 [LOSS Ex2] A: 0.19965 | B: 0.42124 | C: 0.32714 ** [JOINT LOSS] ** : 0.982538 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003895 | Grad Max: 0.101530 -> Layer: shared_layers.0.bias | Grad Mean: 0.265214 | Grad Max: 1.253735 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.007743 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.004519 | Grad Max: 0.004519 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.288979 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033530 | Grad Max: 1.624408 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.010775 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016402 | Grad Max: 0.071299 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000643 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003586 | Grad Max: 0.008449 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001077 | Grad Max: 0.002516 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001685 | Grad Max: 0.002886 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028843 | Grad Max: 0.028843 [GRADIENT NORM TOTAL] 5.3970 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.421 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056508 0.49434918] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 572/1476 | B: 401/1647 | C: 272/1776 [LOSS Ex1] A: 0.66675 | B: 0.66775 | C: 0.66278 [LOGITS Ex2 A] Mean Abs: 1.645 | Max: 6.275 [LOSS Ex2] A: 0.20762 | B: 0.40491 | C: 0.31260 ** [JOINT LOSS] ** : 0.974140 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005371 | Grad Max: 0.159175 -> Layer: shared_layers.0.bias | Grad Mean: 0.289306 | Grad Max: 1.178122 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.007985 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001520 | Grad Max: 0.001520 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.219745 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034604 | Grad Max: 1.195041 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.009959 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015490 | Grad Max: 0.065507 -> Layer: exit2_layers.6.weight | Grad Mean: 
0.000048 | Grad Max: 0.000546 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003420 | Grad Max: 0.007350 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000239 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002294 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001592 | Grad Max: 0.003608 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027463 | Grad Max: 0.027463 [GRADIENT NORM TOTAL] 5.5117 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.432 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016546 0.49834538] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 565/1483 | B: 425/1623 | C: 277/1771 [LOSS Ex1] A: 0.66584 | B: 0.66476 | C: 0.66209 [LOGITS Ex2 A] Mean Abs: 1.654 | Max: 6.035 [LOSS Ex2] A: 0.22442 | B: 0.41230 | C: 0.34424 ** [JOINT LOSS] ** : 0.991219 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008100 | Grad Max: 0.241395 -> Layer: shared_layers.0.bias | Grad Mean: 0.492087 | Grad Max: 2.066132 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.008524 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003239 | Grad Max: 0.003239 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003219 | Grad Max: 0.324119 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059797 | Grad Max: 1.739436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000650 | Grad Max: 0.016825 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028740 | Grad Max: 0.110389 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001094 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006357 | Grad Max: 0.013876 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000468 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001915 | Grad Max: 0.004521 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002949 | Grad Max: 0.005129 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050887 | 
Grad Max: 0.050887 [GRADIENT NORM TOTAL] 9.3653 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.366 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50463057 0.49536943] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.526 | Std: 0.031 [MASKS] A(Pass/Fail): 531/1517 | B: 389/1467 | C: 191/1185 [LOSS Ex1] A: 0.67043 | B: 0.66774 | C: 0.66352 [LOGITS Ex2 A] Mean Abs: 1.628 | Max: 5.406 [LOSS Ex2] A: 0.20236 | B: 0.39211 | C: 0.28432 ** [JOINT LOSS] ** : 0.960164 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005802 | Grad Max: 0.132995 -> Layer: shared_layers.0.bias | Grad Mean: 0.288461 | Grad Max: 1.324068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001924 | Grad Max: 0.006751 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000855 | Grad Max: 0.000855 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.227252 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035407 | Grad Max: 1.255945 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000374 | Grad Max: 0.009784 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016560 | Grad Max: 0.063309 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000626 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003681 | Grad Max: 0.007828 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000273 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001125 | Grad Max: 0.002677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001657 | Grad Max: 0.004251 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030224 | Grad Max: 0.030224 [GRADIENT NORM TOTAL] 5.6014 [EPOCH SUMMARY] Train Loss: 0.9775 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9523 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9541 -> New: 0.9523) ############################## EPOCH 63/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.261 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5238829 0.4761171] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.526 | Std: 0.031 [MASKS] A(Pass/Fail): 544/1504 | B: 404/1644 | C: 252/1796 [LOSS Ex1] A: 0.66998 | B: 0.66707 | C: 0.66640 [LOGITS Ex2 A] Mean Abs: 1.547 | Max: 6.731 [LOSS Ex2] A: 0.20547 | B: 0.41405 | C: 0.35027 ** [JOINT LOSS] ** : 0.991082 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003418 | Grad Max: 0.072407 -> Layer: shared_layers.0.bias | Grad Mean: 0.180759 | Grad Max: 0.832627 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001887 | Grad Max: 0.007030 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001065 | Grad Max: 0.001065 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001298 | Grad Max: 0.154591 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024352 | Grad Max: 0.878496 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000254 | Grad Max: 0.008596 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011230 | Grad Max: 0.053872 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000466 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002463 | Grad Max: 0.005837 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000750 | Grad Max: 0.001943 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001259 | Grad Max: 0.002442 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020930 | Grad Max: 0.020930 [GRADIENT NORM TOTAL] 3.7487 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.418 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6038275 0.39617252] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.529 | Std: 0.032 [MASKS] A(Pass/Fail): 602/1446 | B: 405/1643 | C: 289/1759 [LOSS Ex1] A: 0.66711 | B: 0.66762 | C: 0.66182 [LOGITS Ex2 A] Mean Abs: 1.586 | Max: 6.026 [LOSS Ex2] A: 0.19346 | B: 0.41610 | C: 0.31342 ** [JOINT LOSS] ** : 0.973178 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004490 | Grad Max: 0.099481 -> Layer: shared_layers.0.bias | Grad Mean: 0.266862 | Grad Max: 1.162144 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.008394 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009012 | Grad Max: 0.009012 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001773 | Grad Max: 0.170255 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033556 | Grad Max: 0.969582 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.009948 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015854 | Grad Max: 0.064321 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000676 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003469 | Grad Max: 0.008538 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001033 | Grad Max: 0.002625 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001573 | Grad Max: 0.003452 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026652 | Grad Max: 0.026652 [GRADIENT NORM TOTAL] 5.2216 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.467 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50024956 0.49975044] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 570/1478 | B: 428/1620 | C: 269/1779 [LOSS Ex1] A: 0.67075 | B: 0.66462 | C: 0.66335 [LOGITS Ex2 A] Mean Abs: 1.613 | Max: 5.565 [LOSS Ex2] A: 0.19760 | B: 0.38829 | C: 0.30537 ** [JOINT LOSS] ** : 0.963332 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.060657 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.040540 | Grad Max: 0.254459 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.007306 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004231 | Grad Max: 0.004231 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000385 | Grad Max: 0.125370 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006357 | Grad Max: 0.713282 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003062 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001904 | Grad Max: 0.017588 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000173 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000429 | Grad Max: 0.001890 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000130 | Grad Max: 0.000568 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000545 | Grad Max: 0.001415 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003088 | Grad Max: 0.003088 [GRADIENT NORM TOTAL] 1.1932 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.255 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5907908 0.4092092] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 559/1489 | B: 389/1467 | C: 268/1780 [LOSS Ex1] A: 0.66869 | B: 0.66760 | C: 0.66292 [LOGITS Ex2 A] Mean Abs: 1.640 | Max: 5.753 [LOSS Ex2] A: 0.21989 | B: 0.38019 | C: 0.33020 ** [JOINT LOSS] ** : 0.976496 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006986 | Grad Max: 0.146043 -> Layer: shared_layers.0.bias | Grad Mean: 0.370186 | Grad Max: 1.603212 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007815 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005536 | Grad Max: 0.005536 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002543 | Grad Max: 0.227916 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047356 | Grad Max: 1.268313 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000496 | Grad Max: 0.013909 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021976 | Grad Max: 0.095281 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000803 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004853 | Grad Max: 0.010362 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000351 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001473 | Grad Max: 0.003389 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002325 | Grad Max: 0.004721 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040546 | Grad Max: 0.040546 [GRADIENT NORM TOTAL] 7.2348 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5564338 0.44356617] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 461/1155 | B: 405/1643 | C: 282/1766 [LOSS Ex1] A: 0.66760 | B: 0.66692 | C: 0.66322 [LOGITS Ex2 A] Mean Abs: 1.683 | Max: 5.993 [LOSS Ex2] A: 0.19697 | B: 0.42657 | C: 0.32974 ** [JOINT LOSS] ** : 0.983673 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005347 | Grad Max: 0.112464 -> Layer: shared_layers.0.bias | Grad Mean: 0.304243 | Grad Max: 1.296349 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.007082 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006336 | Grad Max: 0.006336 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.173265 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038065 | Grad Max: 0.968148 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.011808 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017743 | Grad Max: 0.073402 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000743 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003908 | Grad Max: 0.008754 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000293 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001170 | Grad 
Max: 0.002855 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003127 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029754 | Grad Max: 0.029754 [GRADIENT NORM TOTAL] 5.7971 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.468 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060019 0.4939981] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 594/1454 | B: 409/1639 | C: 270/1778 [LOSS Ex1] A: 0.66776 | B: 0.66747 | C: 0.66308 [LOGITS Ex2 A] Mean Abs: 1.624 | Max: 6.099 [LOSS Ex2] A: 0.20346 | B: 0.40577 | C: 0.33288 ** [JOINT LOSS] ** : 0.980143 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001465 | Grad Max: 0.036026 -> Layer: shared_layers.0.bias | Grad Mean: 0.029912 | Grad Max: 0.163630 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.007789 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003813 | Grad Max: 0.003813 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000353 | Grad Max: 0.041874 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005754 | Grad Max: 0.235753 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002266 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001511 | Grad Max: 0.012922 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000176 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000288 | Grad Max: 0.001979 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000086 | Grad Max: 0.000607 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000314 | Grad Max: 0.001020 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002116 | Grad Max: 0.002116 [GRADIENT NORM TOTAL] 0.8548 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.421 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5056335 0.49436656] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 572/1476 | B: 431/1617 | C: 285/1763 [LOSS Ex1] A: 0.66649 | B: 0.66444 | C: 0.66175 [LOGITS Ex2 A] Mean Abs: 1.615 | Max: 5.848 [LOSS Ex2] A: 0.19520 | B: 0.39315 | C: 0.30739 ** [JOINT LOSS] ** : 0.962806 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.043943 -> Layer: shared_layers.0.bias | Grad Mean: 0.143031 | Grad Max: 0.593507 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.008656 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011535 | Grad Max: 0.011535 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000922 | Grad Max: 0.097916 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017139 | Grad Max: 0.555872 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006666 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008063 | Grad Max: 0.036802 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000370 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001742 | Grad Max: 0.004366 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000157 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001622 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000729 | Grad Max: 0.002212 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012516 | Grad Max: 0.012516 [GRADIENT NORM TOTAL] 2.7759 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.433 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016708 0.49832922] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 572/1476 | B: 391/1465 | C: 265/1783 [LOSS Ex1] A: 0.66555 | B: 0.66742 | C: 0.66284 [LOGITS Ex2 A] Mean Abs: 1.626 | Max: 6.168 [LOSS Ex2] A: 0.21691 | B: 0.38754 | C: 0.31643 ** [JOINT LOSS] ** : 0.972233 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.044037 -> Layer: shared_layers.0.bias | Grad Mean: 0.116510 | Grad Max: 0.670056 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.008573 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003966 | Grad Max: 0.003966 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000763 | Grad Max: 0.108260 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013418 | Grad Max: 0.600465 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004768 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005464 | Grad Max: 0.031309 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000254 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001113 | Grad Max: 0.003256 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000308 | Grad Max: 0.000999 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001431 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006280 | Grad Max: 0.006280 [GRADIENT NORM TOTAL] 2.3209 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.367 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50462526 0.49537468] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 536/1512 | B: 406/1642 | C: 283/1765 [LOSS Ex1] A: 0.67016 | B: 0.66673 | C: 0.66241 [LOGITS Ex2 A] Mean Abs: 1.631 | Max: 5.696 [LOSS Ex2] A: 0.20290 | B: 0.42404 | C: 0.30277 ** [JOINT LOSS] ** : 0.976337 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003973 | Grad Max: 0.111358 -> Layer: shared_layers.0.bias | Grad Mean: 0.327291 | Grad Max: 1.433501 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001960 | Grad Max: 0.006913 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000399 | Grad Max: 0.000399 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.185921 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.038589 | Grad Max: 1.019331 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000425 | Grad Max: 0.012825 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019069 | Grad Max: 0.083233 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000694 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004114 | Grad Max: 0.009306 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000305 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001228 | Grad Max: 0.002853 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001749 | Grad Max: 0.003746 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032274 | Grad Max: 0.032274 [GRADIENT NORM TOTAL] 6.3311 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.064 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5240272 0.4759728] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.031 [MASKS] A(Pass/Fail): 548/1500 | B: 410/1638 | C: 276/1772 [LOSS Ex1] A: 0.66971 | B: 0.66727 | C: 0.66204 [LOGITS Ex2 A] Mean Abs: 1.609 | Max: 5.898 [LOSS Ex2] A: 0.21544 | B: 0.40967 | C: 0.32744 ** [JOINT LOSS] ** : 0.983860 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005371 | Grad Max: 0.133389 -> Layer: shared_layers.0.bias | Grad Mean: 0.372008 | Grad Max: 1.627683 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001953 | Grad Max: 0.007013 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001831 | Grad Max: 0.001831 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.234807 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044740 | Grad Max: 1.245124 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.015950 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022040 | Grad Max: 0.102502 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000836 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004799 | Grad Max: 0.010336 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000346 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001430 | Grad Max: 0.003310 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002110 | Grad Max: 0.003891 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037262 | Grad Max: 0.037262 [GRADIENT NORM TOTAL] 7.1978 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.421 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6060464 0.3939536] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.033 [MASKS] A(Pass/Fail): 606/1442 | B: 436/1612 | C: 263/1785 [LOSS Ex1] A: 0.66679 | B: 0.66424 | C: 0.66338 [LOGITS Ex2 A] Mean Abs: 1.633 | Max: 6.175 [LOSS Ex2] A: 0.19230 | B: 0.39345 | C: 0.31856 ** [JOINT LOSS] ** : 0.966239 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.050930 -> Layer: shared_layers.0.bias | Grad Mean: 0.108426 | Grad Max: 0.397108 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.008506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010588 | Grad Max: 0.010588 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000780 | Grad Max: 0.118868 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013883 | Grad Max: 0.670173 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.005834 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005525 | Grad Max: 0.029783 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000272 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001168 | Grad Max: 0.003049 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000359 | Grad Max: 0.001154 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000571 | Grad Max: 0.001944 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010397 | Grad Max: 0.010397 [GRADIENT NORM TOTAL] 2.3128 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.471 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002565 0.49974355] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 573/1475 | B: 395/1461 | C: 254/1794 [LOSS Ex1] A: 0.67044 | B: 0.66723 | C: 0.66510 [LOGITS Ex2 A] Mean Abs: 1.598 | Max: 5.641 [LOSS Ex2] A: 0.20093 | B: 0.40340 | C: 0.32103 ** [JOINT LOSS] ** : 0.976048 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008144 | Grad Max: 0.244861 -> Layer: shared_layers.0.bias | Grad Mean: 0.421342 | Grad Max: 1.709963 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.007373 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008018 | Grad Max: 0.008018 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002821 | Grad Max: 0.301225 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052535 | Grad Max: 1.604407 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000541 | Grad Max: 0.014866 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023948 | Grad Max: 0.097209 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000841 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005338 | Grad Max: 0.011208 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000387 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001598 | Grad Max: 0.003825 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002404 | Grad Max: 0.004416 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041473 | Grad Max: 0.041473 [GRADIENT NORM TOTAL] 8.0861 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.260 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5917732 0.40822676] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.032 [MASKS] A(Pass/Fail): 561/1487 | B: 407/1641 | C: 253/1795 [LOSS Ex1] A: 0.66835 | B: 0.66655 | C: 0.66556 [LOGITS Ex2 A] Mean Abs: 1.581 | Max: 5.668 
[LOSS Ex2] A: 0.23011 | B: 0.44357 | C: 0.34152 ** [JOINT LOSS] ** : 1.005217 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010322 | Grad Max: 0.243568 -> Layer: shared_layers.0.bias | Grad Mean: 0.638289 | Grad Max: 2.608570 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.007597 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006058 | Grad Max: 0.006058 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004217 | Grad Max: 0.408090 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079028 | Grad Max: 2.224020 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000830 | Grad Max: 0.021656 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036930 | Grad Max: 0.144461 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001246 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008160 | Grad Max: 0.017399 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000589 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002443 | Grad Max: 0.005705 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003565 | Grad Max: 0.007066 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063075 | Grad Max: 0.063075 [GRADIENT NORM TOTAL] 12.2394 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.340 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55704296 0.44295704] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.033 [MASKS] A(Pass/Fail): 464/1152 | B: 415/1633 | C: 198/1178 [LOSS Ex1] A: 0.66726 | B: 0.66709 | C: 0.66092 [LOGITS Ex2 A] Mean Abs: 1.627 | Max: 5.741 [LOSS Ex2] A: 0.20068 | B: 0.42338 | C: 0.32155 ** [JOINT LOSS] ** : 0.980295 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006658 | Grad Max: 0.186397 -> Layer: shared_layers.0.bias | Grad Mean: 0.432832 | Grad Max: 1.780846 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.007517 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000818 | Grad 
Max: 0.000818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002885 | Grad Max: 0.249868 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053714 | Grad Max: 1.398377 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.014100 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025476 | Grad Max: 0.099661 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000927 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005619 | Grad Max: 0.012027 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000420 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001689 | Grad Max: 0.004241 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002579 | Grad Max: 0.005315 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044811 | Grad Max: 0.044811 [GRADIENT NORM TOTAL] 8.2996 [EPOCH SUMMARY] Train Loss: 0.9779 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9504 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9523 -> New: 0.9504) ############################## EPOCH 64/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.472 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059829 0.4940171] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 594/1454 | B: 439/1609 | C: 279/1769 [LOSS Ex1] A: 0.66741 | B: 0.66405 | C: 0.66158 [LOGITS Ex2 A] Mean Abs: 1.653 | Max: 6.360 [LOSS Ex2] A: 0.19409 | B: 0.38771 | C: 0.31639 ** [JOINT LOSS] ** : 0.963742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001373 | Grad Max: 0.052159 -> Layer: shared_layers.0.bias | Grad Mean: 0.090091 | Grad Max: 0.464666 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.007852 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002227 | Grad Max: 0.002227 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000611 | Grad Max: 0.071857 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.010925 | Grad Max: 0.401318 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004764 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003983 | Grad Max: 0.024810 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000235 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000844 | Grad Max: 0.002688 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000256 | Grad Max: 0.001072 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000463 | Grad Max: 0.001707 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007109 | Grad Max: 0.007109 [GRADIENT NORM TOTAL] 1.8470 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.423 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056633 0.49433675] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 575/1473 | B: 397/1459 | C: 265/1783 [LOSS Ex1] A: 0.66613 | B: 0.66704 | C: 0.66383 [LOGITS Ex2 A] Mean Abs: 1.665 | Max: 5.686 [LOSS Ex2] A: 0.21539 | B: 0.38042 | C: 0.32377 ** [JOINT LOSS] ** : 0.972196 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006821 | Grad Max: 0.173036 -> Layer: shared_layers.0.bias | Grad Mean: 0.294651 | Grad Max: 1.238211 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.008104 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007306 | Grad Max: 0.007306 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001975 | Grad Max: 0.187176 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037049 | Grad Max: 0.943174 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.011077 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017315 | Grad Max: 0.067822 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000712 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003821 | Grad Max: 0.008771 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001146 | Grad Max: 0.002693 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001753 | Grad Max: 0.003767 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030163 | Grad Max: 0.030163 [GRADIENT NORM TOTAL] 5.6036 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.435 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50161874 0.49838126] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 576/1472 | B: 410/1638 | C: 270/1778 [LOSS Ex1] A: 0.66518 | B: 0.66636 | C: 0.66345 [LOGITS Ex2 A] Mean Abs: 1.650 | Max: 6.144 [LOSS Ex2] A: 0.22020 | B: 0.40915 | C: 0.32496 ** [JOINT LOSS] ** : 0.983104 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005247 | Grad Max: 0.205912 -> Layer: shared_layers.0.bias | Grad Mean: 0.111106 | Grad Max: 0.477594 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.008164 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002884 | Grad Max: 0.002884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000993 | Grad Max: 0.090218 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016902 | Grad Max: 0.490920 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.004255 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007165 | Grad Max: 0.027924 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000387 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001690 | Grad Max: 0.004398 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000517 | Grad Max: 0.001410 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000767 | Grad Max: 0.001961 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013536 | Grad Max: 0.013536 [GRADIENT NORM TOTAL] 2.4204 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.368 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045759 0.49542406] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 537/1511 | B: 419/1629 | C: 289/1759 [LOSS Ex1] A: 0.66983 | B: 0.66689 | C: 0.66032 [LOGITS Ex2 A] Mean Abs: 1.586 | Max: 5.867 [LOSS Ex2] A: 0.19636 | B: 0.41305 | C: 0.31849 ** [JOINT LOSS] ** : 0.974979 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004315 | Grad Max: 0.126521 -> Layer: shared_layers.0.bias | Grad Mean: 0.360309 | Grad Max: 1.601705 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.006573 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003510 | Grad Max: 0.003510 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002362 | Grad Max: 0.233227 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044870 | Grad Max: 1.325794 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.013584 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021560 | Grad Max: 0.090611 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000743 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004668 | Grad Max: 0.009927 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000326 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001388 | Grad Max: 0.003341 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002155 | Grad Max: 0.004013 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036989 | Grad Max: 0.036989 [GRADIENT NORM TOTAL] 7.0441 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.065 | Max: 0.272 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5242624 0.47573757] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 551/1497 | B: 439/1609 | C: 276/1772 [LOSS Ex1] A: 0.66939 | B: 0.66384 | C: 0.66235 [LOGITS Ex2 A] Mean Abs: 1.560 | Max: 6.873 
[LOSS Ex2] A: 0.20839 | B: 0.40562 | C: 0.32955 ** [JOINT LOSS] ** : 0.979714 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006563 | Grad Max: 0.153790 -> Layer: shared_layers.0.bias | Grad Mean: 0.404300 | Grad Max: 1.715205 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.006794 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003836 | Grad Max: 0.003836 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002674 | Grad Max: 0.264209 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050376 | Grad Max: 1.489743 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.014805 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024021 | Grad Max: 0.097063 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000804 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005275 | Grad Max: 0.010630 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000364 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001575 | Grad Max: 0.003699 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002457 | Grad Max: 0.004625 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041224 | Grad Max: 0.041224 [GRADIENT NORM TOTAL] 7.7638 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.424 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6085036 0.3914964] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.033 [MASKS] A(Pass/Fail): 611/1437 | B: 400/1456 | C: 261/1787 [LOSS Ex1] A: 0.66642 | B: 0.66684 | C: 0.66343 [LOGITS Ex2 A] Mean Abs: 1.634 | Max: 5.562 [LOSS Ex2] A: 0.18856 | B: 0.38207 | C: 0.33015 ** [JOINT LOSS] ** : 0.965825 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002436 | Grad Max: 0.061099 -> Layer: shared_layers.0.bias | Grad Mean: 0.114295 | Grad Max: 0.502770 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.007975 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007418 | Grad Max: 
0.007418 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000796 | Grad Max: 0.102448 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014551 | Grad Max: 0.560836 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.005070 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006639 | Grad Max: 0.033065 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001476 | Grad Max: 0.003924 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000426 | Grad Max: 0.001355 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000499 | Grad Max: 0.001640 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009426 | Grad Max: 0.009426 [GRADIENT NORM TOTAL] 2.2332 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.473 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5001995 0.4998005] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 578/1470 | B: 413/1635 | C: 270/1778 [LOSS Ex1] A: 0.67011 | B: 0.66616 | C: 0.66310 [LOGITS Ex2 A] Mean Abs: 1.692 | Max: 5.499 [LOSS Ex2] A: 0.19150 | B: 0.42288 | C: 0.33940 ** [JOINT LOSS] ** : 0.984386 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004854 | Grad Max: 0.147905 -> Layer: shared_layers.0.bias | Grad Mean: 0.449796 | Grad Max: 1.938153 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.007489 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005675 | Grad Max: 0.005675 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002872 | Grad Max: 0.262422 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054171 | Grad Max: 1.455778 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000582 | Grad Max: 0.017003 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026302 | Grad Max: 0.117547 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000976 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.005639 | Grad Max: 0.012200 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000398 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.004016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002329 | Grad Max: 0.004585 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041558 | Grad Max: 0.041558 [GRADIENT NORM TOTAL] 8.8447 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5928031 0.40719688] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 566/1482 | B: 423/1625 | C: 280/1768 [LOSS Ex1] A: 0.66798 | B: 0.66670 | C: 0.66142 [LOGITS Ex2 A] Mean Abs: 1.694 | Max: 6.106 [LOSS Ex2] A: 0.22705 | B: 0.43547 | C: 0.33308 ** [JOINT LOSS] ** : 0.997232 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007997 | Grad Max: 0.227594 -> Layer: shared_layers.0.bias | Grad Mean: 0.657710 | Grad Max: 2.895786 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002067 | Grad Max: 0.007498 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002468 | Grad Max: 0.002468 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004181 | Grad Max: 0.387183 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079253 | Grad Max: 2.193588 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000834 | Grad Max: 0.022446 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037823 | Grad Max: 0.159007 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008247 | Grad Max: 0.017648 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000597 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002456 | Grad Max: 0.006047 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003776 | Grad Max: 0.006590 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064543 | Grad Max: 0.064543 [GRADIENT NORM 
TOTAL] 12.8623 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.55767846 0.4423216 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.033 [MASKS] A(Pass/Fail): 468/1148 | B: 441/1607 | C: 293/1755 [LOSS Ex1] A: 0.66690 | B: 0.66365 | C: 0.66150 [LOGITS Ex2 A] Mean Abs: 1.726 | Max: 6.572 [LOSS Ex2] A: 0.18932 | B: 0.40159 | C: 0.30712 ** [JOINT LOSS] ** : 0.963360 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005197 | Grad Max: 0.168997 -> Layer: shared_layers.0.bias | Grad Mean: 0.479700 | Grad Max: 2.174543 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.008106 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002035 | Grad Max: 0.002035 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002985 | Grad Max: 0.295514 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056476 | Grad Max: 1.629306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000592 | Grad Max: 0.017070 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026913 | Grad Max: 0.113600 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.001024 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005855 | Grad Max: 0.012368 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000402 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001733 | Grad Max: 0.004034 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002500 | Grad Max: 0.004927 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044493 | Grad Max: 0.044493 [GRADIENT NORM TOTAL] 9.2853 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.474 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059948 0.4940052] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 594/1454 | B: 401/1455 | C: 267/1781 [LOSS Ex1] A: 
0.66703 | B: 0.66666 | C: 0.66107 [LOGITS Ex2 A] Mean Abs: 1.665 | Max: 6.434 [LOSS Ex2] A: 0.19872 | B: 0.38393 | C: 0.31489 ** [JOINT LOSS] ** : 0.964096 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002390 | Grad Max: 0.079296 -> Layer: shared_layers.0.bias | Grad Mean: 0.044634 | Grad Max: 0.204423 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.007577 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006364 | Grad Max: 0.006364 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000468 | Grad Max: 0.093513 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007519 | Grad Max: 0.530388 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003323 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001365 | Grad Max: 0.013921 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000153 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000212 | Grad Max: 0.001453 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000071 | Grad Max: 0.000458 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000436 | Grad Max: 0.001143 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001449 | Grad Max: 0.001449 [GRADIENT NORM TOTAL] 1.3178 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.424 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5057479 0.49425206] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.033 [MASKS] A(Pass/Fail): 578/1470 | B: 414/1634 | C: 267/1781 [LOSS Ex1] A: 0.66574 | B: 0.66598 | C: 0.66196 [LOGITS Ex2 A] Mean Abs: 1.628 | Max: 5.871 [LOSS Ex2] A: 0.20410 | B: 0.43569 | C: 0.32650 ** [JOINT LOSS] ** : 0.986655 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006143 | Grad Max: 0.157549 -> Layer: shared_layers.0.bias | Grad Mean: 0.457860 | Grad Max: 2.027681 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad 
Max: 0.008370 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008642 | Grad Max: 0.008642 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002961 | Grad Max: 0.250930 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055775 | Grad Max: 1.369955 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.015128 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027480 | Grad Max: 0.107131 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000960 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006020 | Grad Max: 0.013580 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000413 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001809 | Grad Max: 0.004199 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002772 | Grad Max: 0.005129 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047681 | Grad Max: 0.047681 [GRADIENT NORM TOTAL] 8.6972 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.436 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50161266 0.49838737] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.033 [MASKS] A(Pass/Fail): 579/1469 | B: 426/1622 | C: 261/1787 [LOSS Ex1] A: 0.66478 | B: 0.66651 | C: 0.66320 [LOGITS Ex2 A] Mean Abs: 1.613 | Max: 6.102 [LOSS Ex2] A: 0.22154 | B: 0.42679 | C: 0.32075 ** [JOINT LOSS] ** : 0.987856 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005046 | Grad Max: 0.157832 -> Layer: shared_layers.0.bias | Grad Mean: 0.467063 | Grad Max: 2.086988 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.008558 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009818 | Grad Max: 0.009818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002913 | Grad Max: 0.316020 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054865 | Grad Max: 1.773807 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000586 | Grad Max: 0.019645 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026592 | Grad Max: 0.124463 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.001012 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005758 | Grad Max: 0.012334 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000405 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001718 | Grad Max: 0.003968 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002574 | Grad Max: 0.004830 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044876 | Grad Max: 0.044876 [GRADIENT NORM TOTAL] 9.1866 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.369 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50455976 0.49544024] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.527 | Std: 0.033 [MASKS] A(Pass/Fail): 540/1508 | B: 446/1602 | C: 249/1799 [LOSS Ex1] A: 0.66949 | B: 0.66346 | C: 0.66296 [LOGITS Ex2 A] Mean Abs: 1.613 | Max: 5.900 [LOSS Ex2] A: 0.19341 | B: 0.39123 | C: 0.30670 ** [JOINT LOSS] ** : 0.962416 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002492 | Grad Max: 0.064327 -> Layer: shared_layers.0.bias | Grad Mean: 0.218741 | Grad Max: 0.913266 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.007162 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004665 | Grad Max: 0.004665 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001455 | Grad Max: 0.248636 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026793 | Grad Max: 1.404232 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009028 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013029 | Grad Max: 0.056024 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000473 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002767 | Grad Max: 0.006101 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000200 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000812 | Grad Max: 0.001945 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001200 | Grad Max: 0.002744 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.020505 | Grad Max: 0.020505 [GRADIENT NORM TOTAL] 4.6488 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.277 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5245012 0.47549883] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.527 | Std: 0.032 [MASKS] A(Pass/Fail): 553/1495 | B: 403/1453 | C: 155/1221 [LOSS Ex1] A: 0.66908 | B: 0.66647 | C: 0.66592 [LOGITS Ex2 A] Mean Abs: 1.635 | Max: 5.970 [LOSS Ex2] A: 0.21214 | B: 0.37845 | C: 0.33583 ** [JOINT LOSS] ** : 0.975962 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008258 | Grad Max: 0.252415 -> Layer: shared_layers.0.bias | Grad Mean: 0.337588 | Grad Max: 1.448382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001961 | Grad Max: 0.007340 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005246 | Grad Max: 0.005246 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002406 | Grad Max: 0.219218 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044395 | Grad Max: 1.112526 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000476 | Grad Max: 0.012732 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021203 | Grad Max: 0.085858 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000829 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004751 | Grad Max: 0.011075 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000343 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001423 | Grad Max: 0.003432 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002131 | Grad Max: 0.004316 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037407 | Grad Max: 0.037407 [GRADIENT NORM TOTAL] 6.4093 [EPOCH SUMMARY] Train Loss: 0.9758 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9619 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 65/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.425 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6107488 0.38925114] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.034 [MASKS] A(Pass/Fail): 614/1434 | B: 414/1634 | C: 290/1758 [LOSS Ex1] A: 0.66605 | B: 0.66580 | C: 0.66079 [LOGITS Ex2 A] Mean Abs: 1.683 | Max: 5.455 [LOSS Ex2] A: 0.21450 | B: 0.42130 | C: 0.33217 ** [JOINT LOSS] ** : 0.986867 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011326 | Grad Max: 0.354869 -> Layer: shared_layers.0.bias | Grad Mean: 0.485410 | Grad Max: 2.038709 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.007870 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003204 | Grad Max: 0.003204 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003363 | Grad Max: 0.297169 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062203 | Grad Max: 1.497937 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000646 | Grad Max: 0.014661 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028684 | Grad Max: 0.105598 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.001044 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006412 | Grad Max: 0.014522 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000445 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001914 | Grad Max: 0.004436 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002904 | Grad Max: 0.005034 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049175 | Grad Max: 0.049175 [GRADIENT NORM TOTAL] 9.1639 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.475 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5001479 0.49985212] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 581/1467 | B: 427/1621 | C: 251/1797 [LOSS Ex1] A: 0.66979 | B: 0.66633 | C: 0.66304 [LOGITS Ex2 A] Mean Abs: 1.651 | Max: 5.496 [LOSS Ex2] A: 0.19814 | B: 0.40233 | C: 0.31575 ** [JOINT LOSS] ** : 0.971795 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005174 | Grad Max: 0.137910 -> Layer: shared_layers.0.bias | Grad Mean: 0.283785 | Grad Max: 1.188052 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002035 | Grad Max: 0.007406 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009257 | Grad Max: 0.009257 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001922 | Grad Max: 0.178336 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036104 | Grad Max: 1.004098 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.011249 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017274 | Grad Max: 0.072557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000598 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003821 | Grad Max: 0.008107 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001147 | Grad Max: 0.002731 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001622 | Grad Max: 0.003629 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029149 | Grad Max: 0.029149 [GRADIENT NORM TOTAL] 5.4516 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.270 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5936879 0.40631208] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 569/1479 | B: 447/1601 | C: 286/1762 [LOSS Ex1] A: 0.66763 | B: 0.66328 | C: 0.66122 [LOGITS Ex2 A] Mean Abs: 1.629 | Max: 5.956 [LOSS Ex2] A: 0.20864 | B: 0.39478 | C: 0.32237 ** [JOINT LOSS] ** : 0.972640 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003762 | Grad Max: 0.098554 
-> Layer: shared_layers.0.bias | Grad Mean: 0.307364 | Grad Max: 1.332000 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.007590 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001547 | Grad Max: 0.001547 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.248394 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036572 | Grad Max: 1.402051 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000383 | Grad Max: 0.010631 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017337 | Grad Max: 0.069880 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000611 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003779 | Grad Max: 0.008066 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001122 | Grad Max: 0.002862 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001690 | Grad Max: 0.003129 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028919 | Grad Max: 0.028919 [GRADIENT NORM TOTAL] 6.0970 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.349 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5582951 0.44170496] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 471/1145 | B: 404/1452 | C: 246/1802 [LOSS Ex1] A: 0.66656 | B: 0.66630 | C: 0.66306 [LOGITS Ex2 A] Mean Abs: 1.634 | Max: 5.561 [LOSS Ex2] A: 0.20386 | B: 0.40698 | C: 0.30133 ** [JOINT LOSS] ** : 0.969362 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008076 | Grad Max: 0.185066 -> Layer: shared_layers.0.bias | Grad Mean: 0.490722 | Grad Max: 2.099156 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.007842 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001632 | Grad Max: 0.001632 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003198 | Grad Max: 0.297105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059774 | Grad Max: 1.701940 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000634 | Grad Max: 0.017274 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028454 | Grad Max: 0.115080 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000989 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006288 | Grad Max: 0.012964 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000445 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001885 | Grad Max: 0.004626 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002854 | Grad Max: 0.005078 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048802 | Grad Max: 0.048802 [GRADIENT NORM TOTAL] 9.3515 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.476 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50595844 0.49404156] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 596/1452 | B: 416/1632 | C: 265/1783 [LOSS Ex1] A: 0.66669 | B: 0.66564 | C: 0.66135 [LOGITS Ex2 A] Mean Abs: 1.645 | Max: 6.679 [LOSS Ex2] A: 0.19958 | B: 0.42054 | C: 0.33430 ** [JOINT LOSS] ** : 0.982700 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006356 | Grad Max: 0.164542 -> Layer: shared_layers.0.bias | Grad Mean: 0.343099 | Grad Max: 1.419997 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.007553 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002871 | Grad Max: 0.002871 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.229120 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042146 | Grad Max: 1.258854 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000444 | Grad Max: 0.014240 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019804 | Grad Max: 0.083138 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000738 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004398 | Grad Max: 0.009332 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000324 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001320 | Grad Max: 0.003062 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002028 | Grad Max: 0.003903 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034313 | Grad Max: 0.034313 [GRADIENT NORM TOTAL] 6.5818 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.425 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50586534 0.49413463] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 581/1467 | B: 428/1620 | C: 278/1770 [LOSS Ex1] A: 0.66540 | B: 0.66618 | C: 0.66189 [LOGITS Ex2 A] Mean Abs: 1.673 | Max: 5.582 [LOSS Ex2] A: 0.19631 | B: 0.40249 | C: 0.31929 ** [JOINT LOSS] ** : 0.970520 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002793 | Grad Max: 0.080660 -> Layer: shared_layers.0.bias | Grad Mean: 0.230138 | Grad Max: 1.095554 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.008624 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013616 | Grad Max: 0.013616 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001424 | Grad Max: 0.161714 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026212 | Grad Max: 0.904797 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000253 | Grad Max: 0.006673 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011512 | Grad Max: 0.048668 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000423 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002458 | Grad Max: 0.005846 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.001927 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000979 | Grad Max: 0.002803 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018126 | Grad Max: 0.018126 [GRADIENT NORM TOTAL] 4.5764 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.069 | Max: 0.437 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015286 0.4984714] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 579/1469 | B: 447/1601 | C: 257/1791 [LOSS Ex1] A: 0.66443 | B: 0.66313 | C: 0.66173 [LOGITS Ex2 A] Mean Abs: 1.676 | Max: 7.520 [LOSS Ex2] A: 0.22187 | B: 0.39708 | C: 0.31706 ** [JOINT LOSS] ** : 0.975102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004174 | Grad Max: 0.123480 -> Layer: shared_layers.0.bias | Grad Mean: 0.380196 | Grad Max: 1.706880 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.008275 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005990 | Grad Max: 0.005990 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002331 | Grad Max: 0.227073 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043873 | Grad Max: 1.299753 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.013448 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021236 | Grad Max: 0.091242 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000784 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004647 | Grad Max: 0.010237 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000332 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001394 | Grad Max: 0.003108 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002057 | Grad Max: 0.003928 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036292 | Grad Max: 0.036292 [GRADIENT NORM TOTAL] 7.3582 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.370 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044672 0.49553284] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 543/1505 | B: 405/1451 | C: 260/1788 [LOSS Ex1] A: 0.66921 | B: 0.66615 | C: 0.66273 [LOGITS Ex2 A] Mean Abs: 1.626 | Max: 5.885 [LOSS Ex2] A: 0.19855 | B: 0.38711 | C: 0.31618 ** [JOINT LOSS] ** : 0.966642 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002666 | Grad Max: 0.075299 -> Layer: shared_layers.0.bias | Grad Mean: 0.159040 | Grad Max: 0.727755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001919 | Grad Max: 0.006430 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004504 | Grad Max: 0.004504 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000973 | Grad Max: 0.167243 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018175 | Grad Max: 0.911852 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.006565 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007964 | Grad Max: 0.038868 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000338 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001701 | Grad Max: 0.004216 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001385 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000667 | Grad Max: 0.002256 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012619 | Grad Max: 0.012619 [GRADIENT NORM TOTAL] 3.2006 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.066 | Max: 0.282 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52478623 0.47521383] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 555/1493 | B: 416/1632 | C: 262/1786 [LOSS Ex1] A: 0.66881 | B: 0.66550 | C: 0.66215 [LOGITS Ex2 A] Mean Abs: 1.564 | Max: 6.027 [LOSS Ex2] A: 0.20577 | B: 0.41533 | C: 0.33257 ** [JOINT LOSS] ** : 0.983374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005250 | Grad Max: 0.127103 -> Layer: shared_layers.0.bias | Grad Mean: 0.292457 | Grad Max: 1.323260 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001950 | Grad Max: 0.006760 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004965 | Grad Max: 0.004965 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001931 | Grad Max: 
0.204102 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035989 | Grad Max: 1.087935 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.010998 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017615 | Grad Max: 0.077400 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000616 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003906 | Grad Max: 0.008304 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000295 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001193 | Grad Max: 0.002896 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001968 | Grad Max: 0.003897 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032247 | Grad Max: 0.032247 [GRADIENT NORM TOTAL] 5.5737 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6127213 0.38727874] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.034 [MASKS] A(Pass/Fail): 616/1432 | B: 432/1616 | C: 255/1793 [LOSS Ex1] A: 0.66572 | B: 0.66604 | C: 0.66272 [LOGITS Ex2 A] Mean Abs: 1.615 | Max: 6.182 [LOSS Ex2] A: 0.19649 | B: 0.40867 | C: 0.31856 ** [JOINT LOSS] ** : 0.972732 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004652 | Grad Max: 0.109814 -> Layer: shared_layers.0.bias | Grad Mean: 0.282283 | Grad Max: 1.243654 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.008141 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012052 | Grad Max: 0.012052 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.187894 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033467 | Grad Max: 1.049320 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000367 | Grad Max: 0.010031 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016498 | Grad Max: 0.067374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000558 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003609 | Grad Max: 0.008305 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000285 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001069 | Grad Max: 0.002744 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001575 | Grad Max: 0.003071 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026654 | Grad Max: 0.026654 [GRADIENT NORM TOTAL] 5.2534 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.477 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000769 0.49992314] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 584/1464 | B: 448/1600 | C: 261/1787 [LOSS Ex1] A: 0.66950 | B: 0.66299 | C: 0.66066 [LOGITS Ex2 A] Mean Abs: 1.647 | Max: 5.302 [LOSS Ex2] A: 0.19025 | B: 0.38215 | C: 0.32054 ** [JOINT LOSS] ** : 0.962027 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001848 | Grad Max: 0.050177 -> Layer: shared_layers.0.bias | Grad Mean: 0.030202 | Grad Max: 0.149376 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.007192 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002996 | Grad Max: 0.002996 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000357 | Grad Max: 0.089195 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005642 | Grad Max: 0.506701 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002381 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001275 | Grad Max: 0.010614 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000152 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000213 | Grad Max: 0.001276 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000061 | Grad Max: 0.000434 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.001134 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000618 | Grad Max: 0.000618 [GRADIENT NORM TOTAL] 1.0387 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.274 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59447926 0.40552074] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 569/1479 | B: 406/1450 | C: 275/1773 [LOSS Ex1] A: 0.66732 | B: 0.66601 | C: 0.66347 [LOGITS Ex2 A] Mean Abs: 1.645 | Max: 6.164 [LOSS Ex2] A: 0.21023 | B: 0.37742 | C: 0.33211 ** [JOINT LOSS] ** : 0.972188 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005388 | Grad Max: 0.154283 -> Layer: shared_layers.0.bias | Grad Mean: 0.252529 | Grad Max: 1.070743 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002028 | Grad Max: 0.007785 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006217 | Grad Max: 0.006217 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001790 | Grad Max: 0.147843 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032840 | Grad Max: 0.823122 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.009009 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015822 | Grad Max: 0.061945 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000617 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003538 | Grad Max: 0.007784 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000264 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001072 | Grad Max: 0.002522 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001744 | Grad Max: 0.003501 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029079 | Grad Max: 0.029079 [GRADIENT NORM TOTAL] 4.8649 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.352 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5594049 0.44059503] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.034 [MASKS] A(Pass/Fail): 471/1145 | B: 416/1632 | C: 256/1792 [LOSS Ex1] A: 0.66624 | B: 0.66535 | C: 0.66279 [LOGITS Ex2 A] Mean Abs: 
1.678 | Max: 5.801 [LOSS Ex2] A: 0.18709 | B: 0.41132 | C: 0.29795 ** [JOINT LOSS] ** : 0.963579 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004985 | Grad Max: 0.182958 -> Layer: shared_layers.0.bias | Grad Mean: 0.109540 | Grad Max: 0.456085 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.007561 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000588 | Grad Max: 0.000588 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000966 | Grad Max: 0.184735 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017485 | Grad Max: 1.031997 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.004286 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006915 | Grad Max: 0.025382 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000321 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001586 | Grad Max: 0.004140 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000476 | Grad Max: 0.001189 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000637 | Grad Max: 0.002087 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011678 | Grad Max: 0.011678 [GRADIENT NORM TOTAL] 2.6007 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.479 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50593287 0.49406716] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.034 [MASKS] A(Pass/Fail): 597/1451 | B: 433/1615 | C: 209/1167 [LOSS Ex1] A: 0.66636 | B: 0.66588 | C: 0.65684 [LOGITS Ex2 A] Mean Abs: 1.643 | Max: 6.297 [LOSS Ex2] A: 0.19667 | B: 0.41181 | C: 0.30431 ** [JOINT LOSS] ** : 0.967289 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002870 | Grad Max: 0.084669 -> Layer: shared_layers.0.bias | Grad Mean: 0.231597 | Grad Max: 1.115242 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.007817 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.007074 | Grad Max: 0.007074 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001511 | Grad Max: 0.200498 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028080 | Grad Max: 1.135391 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000287 | Grad Max: 0.009056 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012936 | Grad Max: 0.057576 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000513 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002729 | Grad Max: 0.006147 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000215 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000803 | Grad Max: 0.002030 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001155 | Grad Max: 0.002315 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020294 | Grad Max: 0.020294 [GRADIENT NORM TOTAL] 4.7231 [EPOCH SUMMARY] Train Loss: 0.9726 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9520 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 66/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.426 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5058904 0.4941096] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 583/1465 | B: 451/1597 | C: 289/1759 [LOSS Ex1] A: 0.66503 | B: 0.66282 | C: 0.66062 [LOGITS Ex2 A] Mean Abs: 1.625 | Max: 5.893 [LOSS Ex2] A: 0.19641 | B: 0.39440 | C: 0.30985 ** [JOINT LOSS] ** : 0.963042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004613 | Grad Max: 0.122702 -> Layer: shared_layers.0.bias | Grad Mean: 0.325293 | Grad Max: 1.479474 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.008086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006600 | Grad Max: 0.006600 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.241020 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.038822 | Grad Max: 1.350279 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012913 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017879 | Grad Max: 0.083006 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000634 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003894 | Grad Max: 0.008235 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001156 | Grad Max: 0.002994 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001744 | Grad Max: 0.003357 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029494 | Grad Max: 0.029494 [GRADIENT NORM TOTAL] 6.3582 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.438 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50151885 0.49848112] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 581/1467 | B: 409/1447 | C: 259/1789 [LOSS Ex1] A: 0.66406 | B: 0.66585 | C: 0.66252 [LOGITS Ex2 A] Mean Abs: 1.627 | Max: 7.640 [LOSS Ex2] A: 0.21492 | B: 0.37393 | C: 0.31606 ** [JOINT LOSS] ** : 0.965781 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003029 | Grad Max: 0.079194 -> Layer: shared_layers.0.bias | Grad Mean: 0.124509 | Grad Max: 0.506105 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.008619 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007569 | Grad Max: 0.007569 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.117702 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014464 | Grad Max: 0.659862 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.006740 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005728 | Grad Max: 0.032125 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000321 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001115 | Grad Max: 0.004009 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000313 | Grad Max: 0.000992 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000461 | Grad Max: 0.001547 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006672 | Grad Max: 0.006672 [GRADIENT NORM TOTAL] 2.4229 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.371 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044358 0.49556422] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.034 [MASKS] A(Pass/Fail): 543/1505 | B: 420/1628 | C: 266/1782 [LOSS Ex1] A: 0.66888 | B: 0.66518 | C: 0.66025 [LOGITS Ex2 A] Mean Abs: 1.646 | Max: 6.049 [LOSS Ex2] A: 0.19212 | B: 0.42919 | C: 0.32158 ** [JOINT LOSS] ** : 0.979069 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005735 | Grad Max: 0.137934 -> Layer: shared_layers.0.bias | Grad Mean: 0.425242 | Grad Max: 1.817919 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001988 | Grad Max: 0.006637 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000691 | Grad Max: 0.000691 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002706 | Grad Max: 0.264196 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051141 | Grad Max: 1.433863 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000537 | Grad Max: 0.014830 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024363 | Grad Max: 0.104707 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000847 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005314 | Grad Max: 0.011783 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000380 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001588 | Grad Max: 0.003788 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002327 | Grad Max: 0.004529 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041467 | Grad Max: 0.041467 [GRADIENT NORM TOTAL] 8.1912 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.067 | Max: 0.287 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5249352 0.47506478] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 556/1492 | B: 434/1614 | C: 263/1785 [LOSS Ex1] A: 0.66850 | B: 0.66572 | C: 0.66075 [LOGITS Ex2 A] Mean Abs: 1.646 | Max: 6.123 [LOSS Ex2] A: 0.21378 | B: 0.42975 | C: 0.34315 ** [JOINT LOSS] ** : 0.993883 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006632 | Grad Max: 0.179074 -> Layer: shared_layers.0.bias | Grad Mean: 0.541468 | Grad Max: 2.397496 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001991 | Grad Max: 0.006964 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006098 | Grad Max: 0.006098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003439 | Grad Max: 0.332695 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064670 | Grad Max: 1.837749 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000676 | Grad Max: 0.021379 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030847 | Grad Max: 0.142544 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001071 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006748 | Grad Max: 0.014312 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000466 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002006 | Grad Max: 0.004935 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002841 | Grad Max: 0.004987 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050227 | Grad Max: 0.050227 [GRADIENT NORM TOTAL] 10.5979 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.430 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6149688 0.3850312] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 617/1431 | B: 452/1596 | C: 286/1762 [LOSS Ex1] A: 0.66534 | B: 0.66265 | C: 0.66098 [LOGITS Ex2 A] Mean Abs: 1.672 | Max: 6.089 [LOSS 
Ex2] A: 0.19479 | B: 0.39525 | C: 0.33066 ** [JOINT LOSS] ** : 0.969892 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003024 | Grad Max: 0.136895 -> Layer: shared_layers.0.bias | Grad Mean: 0.335298 | Grad Max: 1.683821 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.007760 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007050 | Grad Max: 0.007050 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.219189 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038780 | Grad Max: 1.228159 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.012215 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018139 | Grad Max: 0.085413 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000662 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003909 | Grad Max: 0.008486 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000256 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001171 | Grad Max: 0.002692 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001693 | Grad Max: 0.003577 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030838 | Grad Max: 0.030838 [GRADIENT NORM TOTAL] 6.7727 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.481 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50004524 0.49995473] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 587/1461 | B: 411/1445 | C: 276/1772 [LOSS Ex1] A: 0.66916 | B: 0.66569 | C: 0.65896 [LOGITS Ex2 A] Mean Abs: 1.641 | Max: 5.521 [LOSS Ex2] A: 0.18994 | B: 0.38504 | C: 0.31102 ** [JOINT LOSS] ** : 0.959936 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006520 | Grad Max: 0.234159 -> Layer: shared_layers.0.bias | Grad Mean: 0.183907 | Grad Max: 0.628653 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.007060 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003001 | Grad Max: 
0.003001 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001370 | Grad Max: 0.153331 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024446 | Grad Max: 0.845876 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.006328 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010801 | Grad Max: 0.037834 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000573 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002535 | Grad Max: 0.006797 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000209 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000762 | Grad Max: 0.001930 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001231 | Grad Max: 0.002622 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020158 | Grad Max: 0.020158 [GRADIENT NORM TOTAL] 3.5400 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.279 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59598196 0.404018 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 573/1475 | B: 426/1622 | C: 275/1773 [LOSS Ex1] A: 0.66696 | B: 0.66503 | C: 0.66052 [LOGITS Ex2 A] Mean Abs: 1.634 | Max: 5.933 [LOSS Ex2] A: 0.21221 | B: 0.41559 | C: 0.29685 ** [JOINT LOSS] ** : 0.972388 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006524 | Grad Max: 0.204694 -> Layer: shared_layers.0.bias | Grad Mean: 0.261879 | Grad Max: 0.959455 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.007912 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007991 | Grad Max: 0.007991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001823 | Grad Max: 0.195136 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033419 | Grad Max: 1.085478 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.009271 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015318 | Grad Max: 0.061272 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000612 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.003449 | Grad Max: 0.008124 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000239 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001019 | Grad Max: 0.002519 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.002927 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024394 | Grad Max: 0.024394 [GRADIENT NORM TOTAL] 5.0210 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.357 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5607465 0.43925354] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 475/1141 | B: 435/1613 | C: 262/1786 [LOSS Ex1] A: 0.66589 | B: 0.66557 | C: 0.66196 [LOGITS Ex2 A] Mean Abs: 1.684 | Max: 5.843 [LOSS Ex2] A: 0.17392 | B: 0.40210 | C: 0.32217 ** [JOINT LOSS] ** : 0.963873 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003314 | Grad Max: 0.103221 -> Layer: shared_layers.0.bias | Grad Mean: 0.048429 | Grad Max: 0.213898 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.007038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003554 | Grad Max: 0.003554 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000491 | Grad Max: 0.048068 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007927 | Grad Max: 0.217881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.003103 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003092 | Grad Max: 0.017648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000171 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000745 | Grad Max: 0.002318 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000116 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000241 | Grad Max: 0.000979 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000579 | Grad Max: 0.001735 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008147 | Grad Max: 0.008147 [GRADIENT NORM 
TOTAL] 1.1839 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.482 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059108 0.4940892] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 598/1450 | B: 454/1594 | C: 244/1804 [LOSS Ex1] A: 0.66601 | B: 0.66249 | C: 0.66194 [LOGITS Ex2 A] Mean Abs: 1.693 | Max: 6.582 [LOSS Ex2] A: 0.21052 | B: 0.39825 | C: 0.33048 ** [JOINT LOSS] ** : 0.976564 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008877 | Grad Max: 0.236564 -> Layer: shared_layers.0.bias | Grad Mean: 0.432353 | Grad Max: 1.828140 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.007801 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004902 | Grad Max: 0.004902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002877 | Grad Max: 0.278094 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053580 | Grad Max: 1.413169 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000556 | Grad Max: 0.014159 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025043 | Grad Max: 0.098580 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000859 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005559 | Grad Max: 0.011790 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000384 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001667 | Grad Max: 0.003931 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002462 | Grad Max: 0.004738 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043154 | Grad Max: 0.043154 [GRADIENT NORM TOTAL] 8.2773 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.428 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059334 0.4940666] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 583/1465 | B: 411/1445 | C: 270/1778 [LOSS Ex1] A: 
0.66467 | B: 0.66553 | C: 0.65925 [LOGITS Ex2 A] Mean Abs: 1.711 | Max: 5.995 [LOSS Ex2] A: 0.22271 | B: 0.38954 | C: 0.30329 ** [JOINT LOSS] ** : 0.968337 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012668 | Grad Max: 0.367307 -> Layer: shared_layers.0.bias | Grad Mean: 0.518921 | Grad Max: 2.362865 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.008428 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009079 | Grad Max: 0.009079 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003529 | Grad Max: 0.350315 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065864 | Grad Max: 1.723576 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000695 | Grad Max: 0.019574 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031258 | Grad Max: 0.126353 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.001087 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006991 | Grad Max: 0.014853 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000456 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002104 | Grad Max: 0.004701 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003209 | Grad Max: 0.006171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055452 | Grad Max: 0.055452 [GRADIENT NORM TOTAL] 9.7655 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.441 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015107 0.49848932] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 581/1467 | B: 427/1621 | C: 277/1771 [LOSS Ex1] A: 0.66371 | B: 0.66487 | C: 0.66150 [LOGITS Ex2 A] Mean Abs: 1.687 | Max: 6.553 [LOSS Ex2] A: 0.24194 | B: 0.41068 | C: 0.32364 ** [JOINT LOSS] ** : 0.988781 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011318 | Grad Max: 0.412218 -> Layer: shared_layers.0.bias | Grad Mean: 0.285211 | Grad Max: 1.134109 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad 
Max: 0.008066 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004585 | Grad Max: 0.004585 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.204541 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037840 | Grad Max: 0.894880 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.008820 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017541 | Grad Max: 0.059673 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000715 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004088 | Grad Max: 0.009000 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000307 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001243 | Grad Max: 0.003192 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001956 | Grad Max: 0.003426 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033137 | Grad Max: 0.033137 [GRADIENT NORM TOTAL] 5.4319 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.373 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044551 0.4955449] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.528 | Std: 0.034 [MASKS] A(Pass/Fail): 544/1504 | B: 436/1612 | C: 251/1797 [LOSS Ex1] A: 0.66858 | B: 0.66541 | C: 0.66269 [LOGITS Ex2 A] Mean Abs: 1.610 | Max: 6.349 [LOSS Ex2] A: 0.19685 | B: 0.40768 | C: 0.31119 ** [JOINT LOSS] ** : 0.970801 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004767 | Grad Max: 0.123688 -> Layer: shared_layers.0.bias | Grad Mean: 0.332387 | Grad Max: 1.615269 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad Max: 0.006715 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000568 | Grad Max: 0.000568 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.190674 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039335 | Grad Max: 1.076508 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.011808 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017127 | Grad Max: 0.079561 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000596 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003570 | Grad Max: 0.007995 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001039 | Grad Max: 0.002662 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001436 | Grad Max: 0.002825 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025627 | Grad Max: 0.025627 [GRADIENT NORM TOTAL] 6.7263 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.525393 0.474607] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.528 | Std: 0.033 [MASKS] A(Pass/Fail): 557/1491 | B: 455/1593 | C: 252/1796 [LOSS Ex1] A: 0.66821 | B: 0.66233 | C: 0.66472 [LOGITS Ex2 A] Mean Abs: 1.549 | Max: 5.825 [LOSS Ex2] A: 0.20649 | B: 0.42296 | C: 0.33631 ** [JOINT LOSS] ** : 0.987010 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004738 | Grad Max: 0.153617 -> Layer: shared_layers.0.bias | Grad Mean: 0.461979 | Grad Max: 1.983481 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.007579 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005645 | Grad Max: 0.005645 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002963 | Grad Max: 0.325120 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055059 | Grad Max: 1.801094 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000577 | Grad Max: 0.018141 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026458 | Grad Max: 0.126090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000979 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005626 | Grad Max: 0.012845 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000398 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001667 | Grad Max: 0.004155 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002265 | Grad Max: 0.004588 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.042347 | Grad Max: 0.042347 [GRADIENT NORM TOTAL] 9.2701 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.433 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61684036 0.38315967] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.035 [MASKS] A(Pass/Fail): 619/1429 | B: 413/1443 | C: 173/1203 [LOSS Ex1] A: 0.66503 | B: 0.66539 | C: 0.66079 [LOGITS Ex2 A] Mean Abs: 1.619 | Max: 5.710 [LOSS Ex2] A: 0.19746 | B: 0.39914 | C: 0.34484 ** [JOINT LOSS] ** : 0.977549 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.168766 -> Layer: shared_layers.0.bias | Grad Mean: 0.482466 | Grad Max: 2.182149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.007658 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002675 | Grad Max: 0.002675 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003043 | Grad Max: 0.333194 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056788 | Grad Max: 1.887400 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000593 | Grad Max: 0.017666 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027114 | Grad Max: 0.120609 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.001023 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005772 | Grad Max: 0.013336 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000397 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001709 | Grad Max: 0.004127 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002373 | Grad Max: 0.004370 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043563 | Grad Max: 0.043563 [GRADIENT NORM TOTAL] 9.7871 [EPOCH SUMMARY] Train Loss: 0.9741 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9458 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9504 -> New: 0.9458) ############################## EPOCH 67/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.484 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50011206 0.49988794] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.035 [MASKS] A(Pass/Fail): 587/1461 | B: 429/1619 | C: 280/1768 [LOSS Ex1] A: 0.66887 | B: 0.66473 | C: 0.66006 [LOGITS Ex2 A] Mean Abs: 1.649 | Max: 6.170 [LOSS Ex2] A: 0.18221 | B: 0.41058 | C: 0.32086 ** [JOINT LOSS] ** : 0.969099 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002399 | Grad Max: 0.053266 -> Layer: shared_layers.0.bias | Grad Mean: 0.090293 | Grad Max: 0.510388 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002012 | Grad Max: 0.006817 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003652 | Grad Max: 0.003652 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000661 | Grad Max: 0.123504 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011470 | Grad Max: 0.694173 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003925 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002999 | Grad Max: 0.023637 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000611 | Grad Max: 0.002745 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000171 | Grad Max: 0.000709 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001078 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003591 | Grad Max: 0.003591 [GRADIENT NORM TOTAL] 2.2006 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.283 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59740543 0.40259457] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.529 | Std: 0.035 [MASKS] A(Pass/Fail): 573/1475 | B: 439/1609 | C: 272/1776 [LOSS Ex1] A: 0.66664 | B: 0.66527 | C: 0.65910 [LOGITS Ex2 A] Mean Abs: 1.695 | Max: 5.659 [LOSS Ex2] A: 0.21086 | B: 0.41786 | C: 0.31818 ** [JOINT LOSS] ** : 0.979304 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006643 | Grad Max: 0.162922 -> Layer: shared_layers.0.bias | Grad Mean: 0.415641 | Grad Max: 1.863426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.006968 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002163 | Grad Max: 0.002163 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002748 | Grad Max: 0.247129 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051581 | Grad Max: 1.394622 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.014685 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024551 | Grad Max: 0.101955 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000894 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005409 | Grad Max: 0.012159 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000374 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001611 | Grad Max: 0.003818 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002296 | Grad Max: 0.004399 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041366 | Grad Max: 0.041366 [GRADIENT NORM TOTAL] 8.2999 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.361 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56159204 0.438408 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 476/1140 | B: 456/1592 | C: 274/1774 [LOSS Ex1] A: 0.66555 | B: 0.66219 | C: 0.66025 [LOGITS Ex2 A] Mean Abs: 1.728 | Max: 5.599 [LOSS Ex2] A: 0.20553 | B: 0.40699 | C: 0.34919 ** [JOINT LOSS] ** : 0.983234 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006499 | Grad Max: 0.206623 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.579609 | Grad Max: 2.650167 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.008080 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004595 | Grad Max: 0.004595 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003705 | Grad Max: 0.329762 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070167 | Grad Max: 1.857152 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000738 | Grad Max: 0.022392 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033847 | Grad Max: 0.158605 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007356 | Grad Max: 0.016042 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000475 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002201 | Grad Max: 0.005158 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003162 | Grad Max: 0.005390 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056716 | Grad Max: 0.056716 [GRADIENT NORM TOTAL] 11.4410 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.485 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060037 0.49399635] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 598/1450 | B: 413/1443 | C: 265/1783 [LOSS Ex1] A: 0.66569 | B: 0.66526 | C: 0.66199 [LOGITS Ex2 A] Mean Abs: 1.667 | Max: 7.011 [LOSS Ex2] A: 0.20253 | B: 0.38794 | C: 0.31756 ** [JOINT LOSS] ** : 0.966986 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003505 | Grad Max: 0.143750 -> Layer: shared_layers.0.bias | Grad Mean: 0.358644 | Grad Max: 1.764086 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007630 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007420 | Grad Max: 0.007420 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.218800 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041860 | Grad Max: 1.228267 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000426 | Grad Max: 0.013466 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019660 | Grad Max: 0.087307 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000673 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004236 | Grad Max: 0.009195 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001269 | Grad Max: 0.003020 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001723 | Grad Max: 0.003859 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032660 | Grad Max: 0.032660 [GRADIENT NORM TOTAL] 7.2667 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.431 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599015 0.49400988] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 583/1465 | B: 434/1614 | C: 256/1792 [LOSS Ex1] A: 0.66432 | B: 0.66459 | C: 0.66202 [LOGITS Ex2 A] Mean Abs: 1.638 | Max: 5.875 [LOSS Ex2] A: 0.20249 | B: 0.40693 | C: 0.30385 ** [JOINT LOSS] ** : 0.968068 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005448 | Grad Max: 0.149128 -> Layer: shared_layers.0.bias | Grad Mean: 0.153919 | Grad Max: 0.564789 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.008775 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013267 | Grad Max: 0.013267 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.215564 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020138 | Grad Max: 1.214542 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.005319 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008398 | Grad Max: 0.032701 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000401 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001910 | Grad Max: 0.004815 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000171 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000565 | Grad 
Max: 0.001606 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000888 | Grad Max: 0.002230 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013739 | Grad Max: 0.013739 [GRADIENT NORM TOTAL] 3.2810 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.444 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015388 0.4984612] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 581/1467 | B: 439/1609 | C: 301/1747 [LOSS Ex1] A: 0.66337 | B: 0.66514 | C: 0.65909 [LOGITS Ex2 A] Mean Abs: 1.623 | Max: 7.703 [LOSS Ex2] A: 0.21622 | B: 0.40846 | C: 0.31260 ** [JOINT LOSS] ** : 0.974964 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006846 | Grad Max: 0.202690 -> Layer: shared_layers.0.bias | Grad Mean: 0.275538 | Grad Max: 1.169728 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.007515 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003143 | Grad Max: 0.003143 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.235046 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034830 | Grad Max: 1.329097 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.010348 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016415 | Grad Max: 0.070580 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000567 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003640 | Grad Max: 0.008354 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001089 | Grad Max: 0.002577 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001739 | Grad Max: 0.003037 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028716 | Grad Max: 0.028716 [GRADIENT NORM TOTAL] 5.2766 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.376 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.50445807 0.4955419 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 545/1503 | B: 458/1590 | C: 249/1799 [LOSS Ex1] A: 0.66830 | B: 0.66205 | C: 0.66119 [LOGITS Ex2 A] Mean Abs: 1.621 | Max: 6.035 [LOSS Ex2] A: 0.20018 | B: 0.38381 | C: 0.32631 ** [JOINT LOSS] ** : 0.967283 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004947 | Grad Max: 0.164636 -> Layer: shared_layers.0.bias | Grad Mean: 0.094819 | Grad Max: 0.468058 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.006629 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001133 | Grad Max: 0.001133 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000844 | Grad Max: 0.063062 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014784 | Grad Max: 0.260678 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.004595 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006759 | Grad Max: 0.026568 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000320 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001587 | Grad Max: 0.004051 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000175 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000482 | Grad Max: 0.001604 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000896 | Grad Max: 0.002317 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013618 | Grad Max: 0.013618 [GRADIENT NORM TOTAL] 2.0048 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.068 | Max: 0.295 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52572364 0.4742764 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 559/1489 | B: 413/1443 | C: 271/1777 [LOSS Ex1] A: 0.66794 | B: 0.66512 | C: 0.66069 [LOGITS Ex2 A] Mean Abs: 1.631 | Max: 5.899 [LOSS Ex2] A: 0.20572 | B: 0.38284 | C: 0.30874 ** [JOINT LOSS] ** : 0.963689 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.005639 | Grad Max: 0.146123 -> Layer: shared_layers.0.bias | Grad Mean: 0.434020 | Grad Max: 1.853808 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.007119 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000533 | Grad Max: 0.000533 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002838 | Grad Max: 0.306358 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053613 | Grad Max: 1.760883 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.015775 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025209 | Grad Max: 0.106029 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000819 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005522 | Grad Max: 0.011846 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001661 | Grad Max: 0.003915 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002372 | Grad Max: 0.004984 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043370 | Grad Max: 0.043370 [GRADIENT NORM TOTAL] 8.7208 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.436 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6185442 0.38145578] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.035 [MASKS] A(Pass/Fail): 619/1429 | B: 434/1614 | C: 252/1796 [LOSS Ex1] A: 0.66472 | B: 0.66445 | C: 0.66165 [LOGITS Ex2 A] Mean Abs: 1.690 | Max: 5.372 [LOSS Ex2] A: 0.21203 | B: 0.44284 | C: 0.33142 ** [JOINT LOSS] ** : 0.992371 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012720 | Grad Max: 0.333733 -> Layer: shared_layers.0.bias | Grad Mean: 0.683264 | Grad Max: 2.889529 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.008165 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009632 | Grad Max: 0.009632 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004620 | Grad Max: 0.403848 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.086003 | Grad Max: 2.106580 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000891 | Grad Max: 0.023642 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040547 | Grad Max: 0.162252 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000120 | Grad Max: 0.001354 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009014 | Grad Max: 0.019489 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000584 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002698 | Grad Max: 0.006226 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003864 | Grad Max: 0.006615 -> Layer: exit2_layers.12.bias | Grad Mean: 0.068802 | Grad Max: 0.068802 [GRADIENT NORM TOTAL] 13.1350 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.488 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5001039 0.4998961] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 593/1455 | B: 442/1606 | C: 283/1765 [LOSS Ex1] A: 0.66859 | B: 0.66500 | C: 0.65763 [LOGITS Ex2 A] Mean Abs: 1.694 | Max: 5.680 [LOSS Ex2] A: 0.20112 | B: 0.41550 | C: 0.32325 ** [JOINT LOSS] ** : 0.977028 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008744 | Grad Max: 0.199699 -> Layer: shared_layers.0.bias | Grad Mean: 0.472219 | Grad Max: 2.059389 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.006946 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000191 | Grad Max: 0.000191 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003140 | Grad Max: 0.311606 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058877 | Grad Max: 1.695468 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000619 | Grad Max: 0.017089 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028266 | Grad Max: 0.125269 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000972 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006240 | Grad Max: 0.013446 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000444 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001861 | Grad Max: 0.004482 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002590 | Grad Max: 0.004645 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046848 | Grad Max: 0.046848 [GRADIENT NORM TOTAL] 9.1221 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.287 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5987494 0.40125066] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 575/1473 | B: 459/1589 | C: 235/1813 [LOSS Ex1] A: 0.66635 | B: 0.66190 | C: 0.66337 [LOGITS Ex2 A] Mean Abs: 1.654 | Max: 5.646 [LOSS Ex2] A: 0.20620 | B: 0.38971 | C: 0.32076 ** [JOINT LOSS] ** : 0.969430 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004014 | Grad Max: 0.139128 -> Layer: shared_layers.0.bias | Grad Mean: 0.059880 | Grad Max: 0.232132 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.007573 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006032 | Grad Max: 0.006032 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000563 | Grad Max: 0.148873 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008982 | Grad Max: 0.840080 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003330 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001983 | Grad Max: 0.014456 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000184 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000445 | Grad Max: 0.001741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000583 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000335 | Grad Max: 0.001238 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004666 | Grad Max: 0.004666 [GRADIENT NORM TOTAL] 1.7570 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 
32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.365 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5623655 0.43763456] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.035 [MASKS] A(Pass/Fail): 476/1140 | B: 413/1443 | C: 258/1790 [LOSS Ex1] A: 0.66524 | B: 0.66497 | C: 0.66195 [LOGITS Ex2 A] Mean Abs: 1.625 | Max: 5.720 [LOSS Ex2] A: 0.18999 | B: 0.41450 | C: 0.32006 ** [JOINT LOSS] ** : 0.972239 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005479 | Grad Max: 0.181313 -> Layer: shared_layers.0.bias | Grad Mean: 0.553273 | Grad Max: 2.444166 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.007611 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002134 | Grad Max: 0.002134 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003402 | Grad Max: 0.335782 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064443 | Grad Max: 1.893171 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000696 | Grad Max: 0.020873 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032160 | Grad Max: 0.145374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001030 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006997 | Grad Max: 0.014644 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000528 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002083 | Grad Max: 0.005253 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002900 | Grad Max: 0.005392 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052937 | Grad Max: 0.052937 [GRADIENT NORM TOTAL] 10.7714 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.489 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060092 0.49399072] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 599/1449 | B: 435/1613 | C: 269/1779 [LOSS Ex1] A: 0.66539 | B: 0.66430 | C: 0.65972 [LOGITS Ex2 A] Mean Abs: 1.615 | Max: 
5.956 [LOSS Ex2] A: 0.20185 | B: 0.46517 | C: 0.34574 ** [JOINT LOSS] ** : 1.000722 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009133 | Grad Max: 0.245748 -> Layer: shared_layers.0.bias | Grad Mean: 0.747361 | Grad Max: 3.168656 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.007903 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004635 | Grad Max: 0.004635 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004697 | Grad Max: 0.458427 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088538 | Grad Max: 2.608337 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000926 | Grad Max: 0.026644 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042789 | Grad Max: 0.184836 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000124 | Grad Max: 0.001405 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009424 | Grad Max: 0.020174 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000635 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002831 | Grad Max: 0.006886 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003920 | Grad Max: 0.008004 -> Layer: exit2_layers.12.bias | Grad Mean: 0.072617 | Grad Max: 0.072617 [GRADIENT NORM TOTAL] 14.4816 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.435 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50610363 0.49389634] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.036 [MASKS] A(Pass/Fail): 585/1463 | B: 442/1606 | C: 170/1206 [LOSS Ex1] A: 0.66400 | B: 0.66485 | C: 0.66190 [LOGITS Ex2 A] Mean Abs: 1.620 | Max: 6.100 [LOSS Ex2] A: 0.19300 | B: 0.43257 | C: 0.33592 ** [JOINT LOSS] ** : 0.984079 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006243 | Grad Max: 0.191738 -> Layer: shared_layers.0.bias | Grad Mean: 0.566292 | Grad Max: 2.451530 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007459 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004910 | 
Grad Max: 0.004910 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003559 | Grad Max: 0.364475 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067478 | Grad Max: 2.074544 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000709 | Grad Max: 0.019414 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032789 | Grad Max: 0.138471 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001071 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007183 | Grad Max: 0.015100 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000495 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002150 | Grad Max: 0.005178 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003051 | Grad Max: 0.006012 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055575 | Grad Max: 0.055575 [GRADIENT NORM TOTAL] 11.0709 [EPOCH SUMMARY] Train Loss: 0.9763 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9441 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9458 -> New: 0.9441) ############################## EPOCH 68/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.448 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015115 0.49848846] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 586/1462 | B: 459/1589 | C: 280/1768 [LOSS Ex1] A: 0.66305 | B: 0.66175 | C: 0.65826 [LOGITS Ex2 A] Mean Abs: 1.631 | Max: 7.201 [LOSS Ex2] A: 0.20463 | B: 0.38928 | C: 0.32248 ** [JOINT LOSS] ** : 0.966483 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002385 | Grad Max: 0.066120 -> Layer: shared_layers.0.bias | Grad Mean: 0.120142 | Grad Max: 0.564941 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002284 | Grad Max: 0.007813 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000584 | Grad Max: 0.000584 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000858 | Grad Max: 0.131564 
-> Layer: exit2_layers.0.bias | Grad Mean: 0.014884 | Grad Max: 0.733691 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.004560 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005930 | Grad Max: 0.030848 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000297 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001254 | Grad Max: 0.003427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000369 | Grad Max: 0.000978 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000621 | Grad Max: 0.001658 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010125 | Grad Max: 0.010125 [GRADIENT NORM TOTAL] 2.5889 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.379 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50443184 0.49556813] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.035 [MASKS] A(Pass/Fail): 548/1500 | B: 413/1443 | C: 257/1791 [LOSS Ex1] A: 0.66802 | B: 0.66483 | C: 0.66105 [LOGITS Ex2 A] Mean Abs: 1.677 | Max: 5.549 [LOSS Ex2] A: 0.20739 | B: 0.39912 | C: 0.32727 ** [JOINT LOSS] ** : 0.975891 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007720 | Grad Max: 0.191644 -> Layer: shared_layers.0.bias | Grad Mean: 0.582976 | Grad Max: 2.519759 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001992 | Grad Max: 0.006826 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004354 | Grad Max: 0.004354 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003705 | Grad Max: 0.337070 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070283 | Grad Max: 1.867501 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000740 | Grad Max: 0.022600 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034054 | Grad Max: 0.155388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.001039 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007465 | Grad Max: 0.015408 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000472 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002247 | Grad Max: 0.005051 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003156 | Grad Max: 0.005639 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057394 | Grad Max: 0.057395 [GRADIENT NORM TOTAL] 11.3001 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.299 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5260454 0.47395465] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.034 [MASKS] A(Pass/Fail): 560/1488 | B: 436/1612 | C: 243/1805 [LOSS Ex1] A: 0.66767 | B: 0.66415 | C: 0.66295 [LOGITS Ex2 A] Mean Abs: 1.672 | Max: 6.633 [LOSS Ex2] A: 0.21987 | B: 0.45504 | C: 0.33638 ** [JOINT LOSS] ** : 1.002023 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011070 | Grad Max: 0.284471 -> Layer: shared_layers.0.bias | Grad Mean: 0.843160 | Grad Max: 3.741110 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001936 | Grad Max: 0.006986 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000002 | Grad Max: 0.000002 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005367 | Grad Max: 0.476806 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101836 | Grad Max: 2.563154 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001065 | Grad Max: 0.032972 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049219 | Grad Max: 0.226917 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000141 | Grad Max: 0.001613 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010772 | Grad Max: 0.022931 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000693 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003221 | Grad Max: 0.007497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004562 | Grad Max: 0.008062 -> Layer: exit2_layers.12.bias | Grad Mean: 0.082102 | Grad Max: 0.082102 [GRADIENT NORM TOTAL] 16.2230 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.439 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62028205 0.37971792] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 619/1429 | B: 443/1605 | C: 289/1759 [LOSS Ex1] A: 0.66440 | B: 0.66471 | C: 0.65837 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.092 [LOSS Ex2] A: 0.21249 | B: 0.45013 | C: 0.34532 ** [JOINT LOSS] ** : 0.998470 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008916 | Grad Max: 0.279665 -> Layer: shared_layers.0.bias | Grad Mean: 0.823246 | Grad Max: 3.630442 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.008610 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013536 | Grad Max: 0.013536 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005226 | Grad Max: 0.490289 -> Layer: exit2_layers.0.bias | Grad Mean: 0.098996 | Grad Max: 2.731357 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001029 | Grad Max: 0.031191 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047623 | Grad Max: 0.214221 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000136 | Grad Max: 0.001513 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010411 | Grad Max: 0.021773 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.000656 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003122 | Grad Max: 0.007424 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004386 | Grad Max: 0.007532 -> Layer: exit2_layers.12.bias | Grad Mean: 0.079350 | Grad Max: 0.079350 [GRADIENT NORM TOTAL] 16.1818 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.491 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500085 0.499915] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 598/1450 | B: 459/1589 | C: 289/1759 [LOSS Ex1] A: 0.66832 | B: 0.66161 | C: 0.65761 [LOGITS Ex2 A] Mean Abs: 1.680 | Max: 5.536 
[LOSS Ex2] A: 0.19965 | B: 0.39731 | C: 0.29216 ** [JOINT LOSS] ** : 0.958886 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004611 | Grad Max: 0.147308 -> Layer: shared_layers.0.bias | Grad Mean: 0.392690 | Grad Max: 1.854511 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.007005 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003110 | Grad Max: 0.003110 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002539 | Grad Max: 0.263986 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046285 | Grad Max: 1.502786 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.015448 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021493 | Grad Max: 0.104800 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000689 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004552 | Grad Max: 0.009319 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001373 | Grad Max: 0.003139 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001925 | Grad Max: 0.004391 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036190 | Grad Max: 0.036190 [GRADIENT NORM TOTAL] 7.8991 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60016036 0.3998396 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.036 [MASKS] A(Pass/Fail): 575/1473 | B: 413/1443 | C: 268/1780 [LOSS Ex1] A: 0.66605 | B: 0.66471 | C: 0.65846 [LOGITS Ex2 A] Mean Abs: 1.628 | Max: 5.961 [LOSS Ex2] A: 0.20960 | B: 0.37932 | C: 0.30977 ** [JOINT LOSS] ** : 0.962634 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006934 | Grad Max: 0.257121 -> Layer: shared_layers.0.bias | Grad Mean: 0.239068 | Grad Max: 0.909150 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.007303 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001624 | Grad Max: 
0.001624 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001664 | Grad Max: 0.197969 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030316 | Grad Max: 0.958885 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.007571 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013929 | Grad Max: 0.052399 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000560 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003225 | Grad Max: 0.007714 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000248 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000970 | Grad Max: 0.002387 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001344 | Grad Max: 0.003051 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024323 | Grad Max: 0.024323 [GRADIENT NORM TOTAL] 4.4524 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.368 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56327045 0.43672958] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 476/1140 | B: 437/1611 | C: 250/1798 [LOSS Ex1] A: 0.66494 | B: 0.66404 | C: 0.66263 [LOGITS Ex2 A] Mean Abs: 1.632 | Max: 5.869 [LOSS Ex2] A: 0.20380 | B: 0.41843 | C: 0.34120 ** [JOINT LOSS] ** : 0.985009 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009507 | Grad Max: 0.273048 -> Layer: shared_layers.0.bias | Grad Mean: 0.484231 | Grad Max: 2.026432 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.007666 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002615 | Grad Max: 0.002615 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003222 | Grad Max: 0.359590 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060080 | Grad Max: 1.964628 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000636 | Grad Max: 0.018524 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028909 | Grad Max: 0.128727 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000946 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.006446 | Grad Max: 0.014113 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000430 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001948 | Grad Max: 0.004641 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002802 | Grad Max: 0.005270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049928 | Grad Max: 0.049928 [GRADIENT NORM TOTAL] 9.2322 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.492 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50600916 0.4939908 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 601/1447 | B: 443/1605 | C: 258/1790 [LOSS Ex1] A: 0.66511 | B: 0.66460 | C: 0.65990 [LOGITS Ex2 A] Mean Abs: 1.642 | Max: 6.457 [LOSS Ex2] A: 0.19250 | B: 0.42016 | C: 0.30521 ** [JOINT LOSS] ** : 0.969159 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007877 | Grad Max: 0.214519 -> Layer: shared_layers.0.bias | Grad Mean: 0.383044 | Grad Max: 1.600721 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.007673 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006045 | Grad Max: 0.006045 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002506 | Grad Max: 0.260541 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046968 | Grad Max: 1.369313 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000496 | Grad Max: 0.015327 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022665 | Grad Max: 0.095142 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000771 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005064 | Grad Max: 0.010655 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000352 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001516 | Grad Max: 0.003788 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002140 | Grad Max: 0.003826 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038270 | Grad Max: 0.038270 [GRADIENT NORM 
TOTAL] 7.1035 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.438 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50625396 0.493746 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 585/1463 | B: 461/1587 | C: 255/1793 [LOSS Ex1] A: 0.66370 | B: 0.66150 | C: 0.66204 [LOGITS Ex2 A] Mean Abs: 1.653 | Max: 6.771 [LOSS Ex2] A: 0.18623 | B: 0.38304 | C: 0.32547 ** [JOINT LOSS] ** : 0.960660 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.061359 -> Layer: shared_layers.0.bias | Grad Mean: 0.115564 | Grad Max: 0.612808 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.007825 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003718 | Grad Max: 0.003718 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000745 | Grad Max: 0.087430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013674 | Grad Max: 0.470574 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.004736 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005948 | Grad Max: 0.029527 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000264 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001298 | Grad Max: 0.003595 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000395 | Grad Max: 0.001188 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.002143 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010874 | Grad Max: 0.010874 [GRADIENT NORM TOTAL] 2.2727 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.451 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015183 0.49848166] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 587/1461 | B: 415/1441 | C: 278/1770 [LOSS Ex1] A: 
0.66276 | B: 0.66460 | C: 0.65858 [LOGITS Ex2 A] Mean Abs: 1.652 | Max: 6.782 [LOSS Ex2] A: 0.21619 | B: 0.38163 | C: 0.31844 ** [JOINT LOSS] ** : 0.967397 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005416 | Grad Max: 0.171184 -> Layer: shared_layers.0.bias | Grad Mean: 0.228090 | Grad Max: 0.924497 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.007583 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001025 | Grad Max: 0.001025 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001630 | Grad Max: 0.145437 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029466 | Grad Max: 0.805916 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.007714 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013333 | Grad Max: 0.051573 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000507 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003049 | Grad Max: 0.006832 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000943 | Grad Max: 0.002372 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.003066 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024455 | Grad Max: 0.024455 [GRADIENT NORM TOTAL] 4.4064 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.381 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044405 0.49555948] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.529 | Std: 0.035 [MASKS] A(Pass/Fail): 551/1497 | B: 439/1609 | C: 252/1796 [LOSS Ex1] A: 0.66776 | B: 0.66392 | C: 0.66155 [LOGITS Ex2 A] Mean Abs: 1.614 | Max: 5.687 [LOSS Ex2] A: 0.19442 | B: 0.40874 | C: 0.31345 ** [JOINT LOSS] ** : 0.969948 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.061532 -> Layer: shared_layers.0.bias | Grad Mean: 0.060791 | Grad Max: 0.246718 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001945 | Grad 
Max: 0.006308 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000185 | Grad Max: 0.000185 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000466 | Grad Max: 0.120164 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007593 | Grad Max: 0.675867 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003978 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001743 | Grad Max: 0.022227 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000140 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000306 | Grad Max: 0.001849 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000087 | Grad Max: 0.000639 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000385 | Grad Max: 0.001257 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001199 | Grad Max: 0.001199 [GRADIENT NORM TOTAL] 1.6036 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.069 | Max: 0.302 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5263743 0.4736257] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.529 | Std: 0.035 [MASKS] A(Pass/Fail): 561/1487 | B: 444/1604 | C: 282/1766 [LOSS Ex1] A: 0.66743 | B: 0.66447 | C: 0.65869 [LOGITS Ex2 A] Mean Abs: 1.553 | Max: 6.092 [LOSS Ex2] A: 0.19826 | B: 0.40576 | C: 0.33016 ** [JOINT LOSS] ** : 0.974922 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.053337 -> Layer: shared_layers.0.bias | Grad Mean: 0.161242 | Grad Max: 0.721753 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.006824 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007561 | Grad Max: 0.007561 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001119 | Grad Max: 0.167833 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020337 | Grad Max: 0.942593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.007871 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009370 | Grad Max: 0.050296 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000317 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002005 | Grad Max: 0.005101 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000165 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000599 | Grad Max: 0.001577 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000883 | Grad Max: 0.002238 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015485 | Grad Max: 0.015485 [GRADIENT NORM TOTAL] 3.3499 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.442 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6218216 0.37817842] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.036 [MASKS] A(Pass/Fail): 619/1429 | B: 461/1587 | C: 257/1791 [LOSS Ex1] A: 0.66410 | B: 0.66135 | C: 0.66220 [LOGITS Ex2 A] Mean Abs: 1.633 | Max: 6.429 [LOSS Ex2] A: 0.18171 | B: 0.38327 | C: 0.32229 ** [JOINT LOSS] ** : 0.958309 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003132 | Grad Max: 0.069792 -> Layer: shared_layers.0.bias | Grad Mean: 0.101516 | Grad Max: 0.353820 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.008699 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012697 | Grad Max: 0.012697 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000731 | Grad Max: 0.093672 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013181 | Grad Max: 0.519939 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.005069 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005410 | Grad Max: 0.025186 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000277 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001225 | Grad Max: 0.003226 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000381 | Grad Max: 0.001323 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000639 | Grad Max: 0.002209 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.010846 | Grad Max: 0.010846 [GRADIENT NORM TOTAL] 2.0588 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.495 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50009936 0.49990064] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.036 [MASKS] A(Pass/Fail): 601/1447 | B: 419/1437 | C: 178/1198 [LOSS Ex1] A: 0.66807 | B: 0.66444 | C: 0.66212 [LOGITS Ex2 A] Mean Abs: 1.631 | Max: 5.325 [LOSS Ex2] A: 0.18647 | B: 0.37597 | C: 0.31693 ** [JOINT LOSS] ** : 0.958003 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003253 | Grad Max: 0.112730 -> Layer: shared_layers.0.bias | Grad Mean: 0.050371 | Grad Max: 0.350805 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.007040 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006752 | Grad Max: 0.006752 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000495 | Grad Max: 0.125089 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007715 | Grad Max: 0.632772 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003169 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001480 | Grad Max: 0.016426 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000185 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000284 | Grad Max: 0.001640 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000090 | Grad Max: 0.000574 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000510 | Grad Max: 0.001407 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000945 | Grad Max: 0.000945 [GRADIENT NORM TOTAL] 1.4470 [EPOCH SUMMARY] Train Loss: 0.9720 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9436 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9441 -> New: 0.9436) ############################## EPOCH 69/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.072 | Max: 0.295 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60154873 0.3984513 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.036 [MASKS] A(Pass/Fail): 581/1467 | B: 442/1606 | C: 291/1757 [LOSS Ex1] A: 0.66574 | B: 0.66375 | C: 0.65828 [LOGITS Ex2 A] Mean Abs: 1.617 | Max: 5.802 [LOSS Ex2] A: 0.20468 | B: 0.40841 | C: 0.30875 ** [JOINT LOSS] ** : 0.969872 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.069562 -> Layer: shared_layers.0.bias | Grad Mean: 0.141796 | Grad Max: 0.568703 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.007413 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001472 | Grad Max: 0.001472 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.100141 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017456 | Grad Max: 0.569125 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.006235 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008136 | Grad Max: 0.037396 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000414 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001839 | Grad Max: 0.005095 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000549 | Grad Max: 0.001521 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000749 | Grad Max: 0.001984 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013213 | Grad Max: 0.013213 [GRADIENT NORM TOTAL] 2.6909 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.372 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5641892 0.4358108] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 481/1135 | B: 447/1601 | C: 274/1774 [LOSS Ex1] A: 0.66460 | B: 0.66430 | C: 0.65806 [LOGITS Ex2 A] Mean Abs: 1.692 | Max: 5.675 [LOSS Ex2] A: 0.18315 | B: 0.39885 | C: 0.30214 ** [JOINT LOSS] ** : 0.957034 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002459 | Grad Max: 0.076015 -> Layer: shared_layers.0.bias | Grad Mean: 0.201169 | Grad Max: 0.931369 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.007770 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003572 | Grad Max: 0.003572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001322 | Grad Max: 0.134376 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024518 | Grad Max: 0.751826 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.008725 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010859 | Grad Max: 0.054676 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000421 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002378 | Grad Max: 0.005628 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000184 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000721 | Grad Max: 0.001906 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001014 | Grad Max: 0.002689 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019136 | Grad Max: 0.019136 [GRADIENT NORM TOTAL] 4.1162 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.496 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50601035 0.4939897 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.037 [MASKS] A(Pass/Fail): 604/1444 | B: 462/1586 | C: 315/1733 [LOSS Ex1] A: 0.66477 | B: 0.66116 | C: 0.65495 [LOGITS Ex2 A] Mean Abs: 1.661 | Max: 5.855 [LOSS Ex2] A: 0.19272 | B: 0.38185 | C: 0.29911 ** [JOINT LOSS] ** : 0.951523 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.052424 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.142668 | Grad Max: 0.613045 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.007056 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001925 | Grad Max: 0.001925 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001005 | Grad Max: 0.118986 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017809 | Grad Max: 0.671348 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.004507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007414 | Grad Max: 0.030357 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000302 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001676 | Grad Max: 0.004123 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000518 | Grad Max: 0.001430 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000715 | Grad Max: 0.002483 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013931 | Grad Max: 0.013931 [GRADIENT NORM TOTAL] 2.9808 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.442 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5065246 0.49347535] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.037 [MASKS] A(Pass/Fail): 588/1460 | B: 420/1436 | C: 266/1782 [LOSS Ex1] A: 0.66331 | B: 0.66426 | C: 0.65987 [LOGITS Ex2 A] Mean Abs: 1.639 | Max: 5.798 [LOSS Ex2] A: 0.17872 | B: 0.38583 | C: 0.32499 ** [JOINT LOSS] ** : 0.958995 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002399 | Grad Max: 0.068024 -> Layer: shared_layers.0.bias | Grad Mean: 0.196769 | Grad Max: 0.883930 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.008024 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007063 | Grad Max: 0.007063 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001325 | Grad Max: 0.238899 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024364 | Grad Max: 1.345304 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000247 | Grad Max: 0.009680 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011385 | Grad Max: 0.059641 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002422 | Grad Max: 0.005755 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000714 | Grad Max: 0.001894 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001020 | Grad Max: 0.002356 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017523 | Grad Max: 0.017523 [GRADIENT NORM TOTAL] 4.1863 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.455 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015182 0.49848184] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 593/1455 | B: 442/1606 | C: 288/1760 [LOSS Ex1] A: 0.66235 | B: 0.66356 | C: 0.65676 [LOGITS Ex2 A] Mean Abs: 1.624 | Max: 6.136 [LOSS Ex2] A: 0.21346 | B: 0.41391 | C: 0.32685 ** [JOINT LOSS] ** : 0.978963 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003136 | Grad Max: 0.078773 -> Layer: shared_layers.0.bias | Grad Mean: 0.176172 | Grad Max: 0.839682 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.008179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005374 | Grad Max: 0.005374 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001279 | Grad Max: 0.275406 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022763 | Grad Max: 1.544173 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.007429 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009743 | Grad Max: 0.048180 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000352 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002043 | Grad Max: 0.004820 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000604 | Grad 
Max: 0.001552 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000910 | Grad Max: 0.001886 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015770 | Grad Max: 0.015770 [GRADIENT NORM TOTAL] 4.0155 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.385 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50440884 0.49559116] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.036 [MASKS] A(Pass/Fail): 557/1491 | B: 449/1599 | C: 249/1799 [LOSS Ex1] A: 0.66739 | B: 0.66411 | C: 0.66251 [LOGITS Ex2 A] Mean Abs: 1.643 | Max: 6.058 [LOSS Ex2] A: 0.18531 | B: 0.40608 | C: 0.33488 ** [JOINT LOSS] ** : 0.973429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002284 | Grad Max: 0.054663 -> Layer: shared_layers.0.bias | Grad Mean: 0.125636 | Grad Max: 0.427099 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.006180 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002634 | Grad Max: 0.002634 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000900 | Grad Max: 0.168024 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016341 | Grad Max: 0.941334 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.003839 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005120 | Grad Max: 0.027220 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000284 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001124 | Grad Max: 0.003509 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000116 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000324 | Grad Max: 0.001091 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001207 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006622 | Grad Max: 0.006622 [GRADIENT NORM TOTAL] 2.9104 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.070 | Max: 0.308 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5269097 0.47309032] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.035 [MASKS] A(Pass/Fail): 561/1487 | B: 463/1585 | C: 284/1764 [LOSS Ex1] A: 0.66707 | B: 0.66097 | C: 0.65946 [LOGITS Ex2 A] Mean Abs: 1.622 | Max: 5.907 [LOSS Ex2] A: 0.19377 | B: 0.39493 | C: 0.30766 ** [JOINT LOSS] ** : 0.961288 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003058 | Grad Max: 0.097831 -> Layer: shared_layers.0.bias | Grad Mean: 0.116428 | Grad Max: 0.547028 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.007520 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009039 | Grad Max: 0.009039 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000821 | Grad Max: 0.142731 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014695 | Grad Max: 0.752586 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.004341 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005516 | Grad Max: 0.026801 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000261 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001248 | Grad Max: 0.003397 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000363 | Grad Max: 0.001014 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000446 | Grad Max: 0.001721 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007928 | Grad Max: 0.007928 [GRADIENT NORM TOTAL] 2.4951 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.446 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6241763 0.3758237] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.037 [MASKS] A(Pass/Fail): 621/1427 | B: 420/1436 | C: 244/1804 [LOSS Ex1] A: 0.66366 | B: 0.66407 | C: 0.66187 [LOGITS Ex2 A] Mean Abs: 1.658 | Max: 5.891 [LOSS Ex2] A: 0.18326 | B: 0.38552 | C: 0.29274 ** [JOINT LOSS] ** : 0.950376 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.044591 -> Layer: shared_layers.0.bias | Grad Mean: 0.086373 | Grad Max: 0.369956 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.007598 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006914 | Grad Max: 0.006914 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000621 | Grad Max: 0.080407 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011213 | Grad Max: 0.446311 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.004528 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005743 | Grad Max: 0.028925 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001246 | Grad Max: 0.003431 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000153 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000350 | Grad Max: 0.001096 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000555 | Grad Max: 0.001839 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007449 | Grad Max: 0.007449 [GRADIENT NORM TOTAL] 1.7256 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.499 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50009197 0.49990803] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.036 [MASKS] A(Pass/Fail): 603/1445 | B: 443/1605 | C: 271/1777 [LOSS Ex1] A: 0.66768 | B: 0.66336 | C: 0.65949 [LOGITS Ex2 A] Mean Abs: 1.683 | Max: 5.361 [LOSS Ex2] A: 0.18353 | B: 0.40048 | C: 0.30611 ** [JOINT LOSS] ** : 0.960216 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.060218 -> Layer: shared_layers.0.bias | Grad Mean: 0.127937 | Grad Max: 0.627055 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.006986 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005746 | Grad Max: 0.005746 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000880 | Grad Max: 0.102083 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.015926 | Grad Max: 0.571334 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.005865 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007072 | Grad Max: 0.038441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000351 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001464 | Grad Max: 0.004120 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000425 | Grad Max: 0.001161 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000506 | Grad Max: 0.001833 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010188 | Grad Max: 0.010188 [GRADIENT NORM TOTAL] 2.5900 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.301 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60350955 0.39649045] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.037 [MASKS] A(Pass/Fail): 584/1464 | B: 452/1596 | C: 237/1811 [LOSS Ex1] A: 0.66529 | B: 0.66391 | C: 0.66273 [LOGITS Ex2 A] Mean Abs: 1.659 | Max: 5.755 [LOSS Ex2] A: 0.20525 | B: 0.40091 | C: 0.31191 ** [JOINT LOSS] ** : 0.970002 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.099604 -> Layer: shared_layers.0.bias | Grad Mean: 0.065118 | Grad Max: 0.263539 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.007820 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008072 | Grad Max: 0.008072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000463 | Grad Max: 0.122020 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007699 | Grad Max: 0.674551 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002912 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001763 | Grad Max: 0.016014 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000182 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000365 | Grad Max: 0.001884 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000637 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001323 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003314 | Grad Max: 0.003314 [GRADIENT NORM TOTAL] 1.5137 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.378 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5655113 0.43448865] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.037 [MASKS] A(Pass/Fail): 481/1135 | B: 463/1585 | C: 271/1777 [LOSS Ex1] A: 0.66413 | B: 0.66075 | C: 0.65757 [LOGITS Ex2 A] Mean Abs: 1.715 | Max: 5.764 [LOSS Ex2] A: 0.18535 | B: 0.38578 | C: 0.31241 ** [JOINT LOSS] ** : 0.955328 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.036117 -> Layer: shared_layers.0.bias | Grad Mean: 0.071461 | Grad Max: 0.306810 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.007718 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000356 | Grad Max: 0.000356 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000533 | Grad Max: 0.108123 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009456 | Grad Max: 0.598181 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003488 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002846 | Grad Max: 0.021683 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000255 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000651 | Grad Max: 0.002303 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000206 | Grad Max: 0.000849 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000445 | Grad Max: 0.001549 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006219 | Grad Max: 0.006219 [GRADIENT NORM TOTAL] 1.7418 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.501 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50605017 0.49394986] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 609/1439 | B: 423/1433 | C: 226/1822 [LOSS Ex1] A: 0.66428 | B: 0.66384 | C: 0.66316 [LOGITS Ex2 A] Mean Abs: 1.721 | Max: 7.003 [LOSS Ex2] A: 0.19068 | B: 0.38991 | C: 0.30749 ** [JOINT LOSS] ** : 0.959787 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002281 | Grad Max: 0.052367 -> Layer: shared_layers.0.bias | Grad Mean: 0.139276 | Grad Max: 0.570654 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.007882 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010767 | Grad Max: 0.010767 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000995 | Grad Max: 0.127268 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018383 | Grad Max: 0.720915 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.006482 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008106 | Grad Max: 0.041901 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001789 | Grad Max: 0.004156 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000552 | Grad Max: 0.001368 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000886 | Grad Max: 0.002517 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015339 | Grad Max: 0.015339 [GRADIENT NORM TOTAL] 2.9198 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.448 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068984 0.49310163] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.037 [MASKS] A(Pass/Fail): 591/1457 | B: 443/1605 | C: 249/1799 [LOSS Ex1] A: 0.66277 | B: 0.66312 | C: 0.66081 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 
6.256 [LOSS Ex2] A: 0.19039 | B: 0.40620 | C: 0.33048 ** [JOINT LOSS] ** : 0.971252 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.040721 -> Layer: shared_layers.0.bias | Grad Mean: 0.079661 | Grad Max: 0.462967 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.008109 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008413 | Grad Max: 0.008413 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000570 | Grad Max: 0.118512 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009816 | Grad Max: 0.659728 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003641 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003189 | Grad Max: 0.018224 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000675 | Grad Max: 0.002448 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000087 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000204 | Grad Max: 0.000820 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000365 | Grad Max: 0.001246 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005690 | Grad Max: 0.005690 [GRADIENT NORM TOTAL] 1.8965 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.461 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50149864 0.4985014 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.037 [MASKS] A(Pass/Fail): 599/1449 | B: 453/1595 | C: 172/1204 [LOSS Ex1] A: 0.66177 | B: 0.66366 | C: 0.65919 [LOGITS Ex2 A] Mean Abs: 1.695 | Max: 6.210 [LOSS Ex2] A: 0.20926 | B: 0.40087 | C: 0.29374 ** [JOINT LOSS] ** : 0.962831 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004004 | Grad Max: 0.147046 -> Layer: shared_layers.0.bias | Grad Mean: 0.064193 | Grad Max: 0.305326 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.007690 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000822 | 
Grad Max: 0.000822 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000609 | Grad Max: 0.112459 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009579 | Grad Max: 0.571090 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.002814 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002589 | Grad Max: 0.015524 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000617 | Grad Max: 0.002328 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000183 | Grad Max: 0.000695 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000279 | Grad Max: 0.001156 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004307 | Grad Max: 0.004307 [GRADIENT NORM TOTAL] 1.6749 [EPOCH SUMMARY] Train Loss: 0.9629 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9410 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9436 -> New: 0.9410) ############################## EPOCH 70/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.390 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043737 0.49562627] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.530 | Std: 0.037 [MASKS] A(Pass/Fail): 559/1489 | B: 464/1584 | C: 292/1756 [LOSS Ex1] A: 0.66689 | B: 0.66047 | C: 0.65637 [LOGITS Ex2 A] Mean Abs: 1.679 | Max: 6.629 [LOSS Ex2] A: 0.17926 | B: 0.38056 | C: 0.30113 ** [JOINT LOSS] ** : 0.948226 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002918 | Grad Max: 0.078839 -> Layer: shared_layers.0.bias | Grad Mean: 0.067960 | Grad Max: 0.351085 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.007111 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008004 | Grad Max: 0.008004 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000650 | Grad Max: 0.044341 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.011135 | Grad Max: 0.227211 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.005519 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005338 | Grad Max: 0.027655 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000268 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001215 | Grad Max: 0.003433 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000145 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000362 | Grad Max: 0.001286 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000565 | Grad Max: 0.001844 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009011 | Grad Max: 0.009011 [GRADIENT NORM TOTAL] 1.4869 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.071 | Max: 0.314 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52765477 0.47234526] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.530 | Std: 0.036 [MASKS] A(Pass/Fail): 565/1483 | B: 424/1432 | C: 272/1776 [LOSS Ex1] A: 0.66656 | B: 0.66357 | C: 0.65763 [LOGITS Ex2 A] Mean Abs: 1.690 | Max: 6.233 [LOSS Ex2] A: 0.19388 | B: 0.37986 | C: 0.29004 ** [JOINT LOSS] ** : 0.950516 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003438 | Grad Max: 0.078932 -> Layer: shared_layers.0.bias | Grad Mean: 0.170396 | Grad Max: 0.759365 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.007323 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009887 | Grad Max: 0.009887 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001137 | Grad Max: 0.164427 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020954 | Grad Max: 0.861242 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.010716 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009338 | Grad Max: 0.066786 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000397 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002015 | Grad Max: 0.004782 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000600 | Grad Max: 0.001476 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000833 | Grad Max: 0.002307 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014945 | Grad Max: 0.014945 [GRADIENT NORM TOTAL] 3.4790 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.452 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6273321 0.3726679] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.038 [MASKS] A(Pass/Fail): 628/1420 | B: 445/1603 | C: 261/1787 [LOSS Ex1] A: 0.66304 | B: 0.66284 | C: 0.65839 [LOGITS Ex2 A] Mean Abs: 1.729 | Max: 5.913 [LOSS Ex2] A: 0.18101 | B: 0.40280 | C: 0.31341 ** [JOINT LOSS] ** : 0.960496 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002740 | Grad Max: 0.095527 -> Layer: shared_layers.0.bias | Grad Mean: 0.207622 | Grad Max: 1.017944 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.008349 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012926 | Grad Max: 0.012926 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.126448 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025657 | Grad Max: 0.703658 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000259 | Grad Max: 0.010623 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012085 | Grad Max: 0.073997 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000426 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002586 | Grad Max: 0.005886 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000768 | Grad Max: 0.001800 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001132 | Grad Max: 0.002497 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020261 | Grad Max: 0.020261 [GRADIENT NORM TOTAL] 4.1462 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.507 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000887 0.4999113] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.037 [MASKS] A(Pass/Fail): 606/1442 | B: 487/1561 | C: 268/1780 [LOSS Ex1] A: 0.66714 | B: 0.66338 | C: 0.65932 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 5.839 [LOSS Ex2] A: 0.17968 | B: 0.40114 | C: 0.33524 ** [JOINT LOSS] ** : 0.968634 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007740 | Grad Max: 0.260775 -> Layer: shared_layers.0.bias | Grad Mean: 0.242399 | Grad Max: 1.014084 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.006520 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001608 | Grad Max: 0.001608 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001703 | Grad Max: 0.251326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030812 | Grad Max: 1.406180 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000311 | Grad Max: 0.007571 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014085 | Grad Max: 0.051802 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000569 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003228 | Grad Max: 0.007086 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.002310 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001388 | Grad Max: 0.002579 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024222 | Grad Max: 0.024222 [GRADIENT NORM TOTAL] 4.7317 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.308 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6060719 0.39392814] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 588/1460 | B: 489/1559 | C: 252/1796 [LOSS Ex1] A: 0.66469 | B: 0.66019 | C: 0.65920 [LOGITS Ex2 A] Mean Abs: 1.686 | Max: 6.057 [LOSS 
Ex2] A: 0.19892 | B: 0.38493 | C: 0.32550 ** [JOINT LOSS] ** : 0.964477 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005823 | Grad Max: 0.162444 -> Layer: shared_layers.0.bias | Grad Mean: 0.238411 | Grad Max: 0.902657 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007316 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001512 | Grad Max: 0.001512 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001579 | Grad Max: 0.253420 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029130 | Grad Max: 1.427959 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.009272 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013204 | Grad Max: 0.053885 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000493 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002959 | Grad Max: 0.006586 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000864 | Grad Max: 0.002160 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001343 | Grad Max: 0.002548 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022201 | Grad Max: 0.022201 [GRADIENT NORM TOTAL] 4.7101 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.385 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.56710505 0.43289497] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.038 [MASKS] A(Pass/Fail): 485/1131 | B: 453/1403 | C: 241/1807 [LOSS Ex1] A: 0.66351 | B: 0.66331 | C: 0.66127 [LOGITS Ex2 A] Mean Abs: 1.758 | Max: 6.033 [LOSS Ex2] A: 0.19131 | B: 0.38165 | C: 0.32537 ** [JOINT LOSS] ** : 0.962138 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003139 | Grad Max: 0.067808 -> Layer: shared_layers.0.bias | Grad Mean: 0.129168 | Grad Max: 0.645741 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007744 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006565 | Grad Max: 
0.006565 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001040 | Grad Max: 0.134604 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019066 | Grad Max: 0.752246 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.008024 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008707 | Grad Max: 0.050116 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000350 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001878 | Grad Max: 0.004642 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000550 | Grad Max: 0.001493 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000836 | Grad Max: 0.001996 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014405 | Grad Max: 0.014405 [GRADIENT NORM TOTAL] 2.7877 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.509 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061049 0.4938951] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.533 | Std: 0.038 [MASKS] A(Pass/Fail): 613/1435 | B: 472/1576 | C: 265/1783 [LOSS Ex1] A: 0.66369 | B: 0.66258 | C: 0.65881 [LOGITS Ex2 A] Mean Abs: 1.738 | Max: 6.548 [LOSS Ex2] A: 0.18417 | B: 0.40382 | C: 0.29913 ** [JOINT LOSS] ** : 0.957400 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003155 | Grad Max: 0.055287 -> Layer: shared_layers.0.bias | Grad Mean: 0.060343 | Grad Max: 0.306848 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.007925 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009516 | Grad Max: 0.009516 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000628 | Grad Max: 0.154057 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010366 | Grad Max: 0.868654 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000094 | Grad Max: 0.005125 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003927 | Grad Max: 0.023539 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000239 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.000877 | Grad Max: 0.002625 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.000857 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001613 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007591 | Grad Max: 0.007591 [GRADIENT NORM TOTAL] 1.7130 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.455 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50720555 0.4927944 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 595/1453 | B: 487/1561 | C: 287/1761 [LOSS Ex1] A: 0.66214 | B: 0.66313 | C: 0.65830 [LOGITS Ex2 A] Mean Abs: 1.692 | Max: 6.190 [LOSS Ex2] A: 0.18474 | B: 0.40069 | C: 0.33228 ** [JOINT LOSS] ** : 0.967092 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002656 | Grad Max: 0.092177 -> Layer: shared_layers.0.bias | Grad Mean: 0.251638 | Grad Max: 1.127915 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.007541 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003154 | Grad Max: 0.003154 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001608 | Grad Max: 0.162825 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030136 | Grad Max: 0.916949 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.010883 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014812 | Grad Max: 0.073077 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000494 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003214 | Grad Max: 0.007290 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000945 | Grad Max: 0.002446 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001374 | Grad Max: 0.002681 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024366 | Grad Max: 0.024366 [GRADIENT NORM 
TOTAL] 4.9181 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.469 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015681 0.49843186] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 602/1446 | B: 491/1557 | C: 290/1758 [LOSS Ex1] A: 0.66114 | B: 0.65993 | C: 0.65597 [LOGITS Ex2 A] Mean Abs: 1.694 | Max: 7.182 [LOSS Ex2] A: 0.20706 | B: 0.38104 | C: 0.29480 ** [JOINT LOSS] ** : 0.953313 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003254 | Grad Max: 0.098724 -> Layer: shared_layers.0.bias | Grad Mean: 0.200522 | Grad Max: 0.863219 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.007636 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001362 | Grad Max: 0.001362 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001310 | Grad Max: 0.154537 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023073 | Grad Max: 0.844493 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.007155 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010249 | Grad Max: 0.049124 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000407 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002101 | Grad Max: 0.004872 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000590 | Grad Max: 0.001674 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000750 | Grad Max: 0.001983 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013691 | Grad Max: 0.013691 [GRADIENT NORM TOTAL] 3.8680 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.396 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044213 0.49557874] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.531 | Std: 0.038 [MASKS] A(Pass/Fail): 562/1486 | B: 455/1401 | C: 249/1799 [LOSS Ex1] A: 
0.66636 | B: 0.66305 | C: 0.66188 [LOGITS Ex2 A] Mean Abs: 1.704 | Max: 5.856 [LOSS Ex2] A: 0.18009 | B: 0.37376 | C: 0.32381 ** [JOINT LOSS] ** : 0.956318 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004197 | Grad Max: 0.119586 -> Layer: shared_layers.0.bias | Grad Mean: 0.198567 | Grad Max: 0.881123 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001901 | Grad Max: 0.006310 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001646 | Grad Max: 0.001646 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001365 | Grad Max: 0.153054 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025096 | Grad Max: 0.843200 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.008104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011315 | Grad Max: 0.055706 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000419 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002492 | Grad Max: 0.006245 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000184 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000726 | Grad Max: 0.001838 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000949 | Grad Max: 0.002402 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017794 | Grad Max: 0.017794 [GRADIENT NORM TOTAL] 4.0633 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.073 | Max: 0.321 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5282064 0.47179353] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.037 [MASKS] A(Pass/Fail): 567/1481 | B: 473/1575 | C: 270/1778 [LOSS Ex1] A: 0.66606 | B: 0.66233 | C: 0.65788 [LOGITS Ex2 A] Mean Abs: 1.673 | Max: 6.053 [LOSS Ex2] A: 0.19550 | B: 0.40913 | C: 0.30047 ** [JOINT LOSS] ** : 0.963790 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003996 | Grad Max: 0.096086 -> Layer: shared_layers.0.bias | Grad Mean: 0.236810 | Grad Max: 1.107232 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad 
Max: 0.007537 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004783 | Grad Max: 0.004783 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001520 | Grad Max: 0.161993 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028304 | Grad Max: 0.908366 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.009110 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014084 | Grad Max: 0.063130 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000540 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003067 | Grad Max: 0.007242 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000895 | Grad Max: 0.002331 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001205 | Grad Max: 0.002569 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022322 | Grad Max: 0.022322 [GRADIENT NORM TOTAL] 4.5401 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.457 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63011426 0.36988568] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.039 [MASKS] A(Pass/Fail): 635/1413 | B: 489/1559 | C: 273/1775 [LOSS Ex1] A: 0.66249 | B: 0.66289 | C: 0.65743 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.326 [LOSS Ex2] A: 0.18284 | B: 0.39514 | C: 0.30452 ** [JOINT LOSS] ** : 0.955105 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002837 | Grad Max: 0.057319 -> Layer: shared_layers.0.bias | Grad Mean: 0.126809 | Grad Max: 0.539896 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.007788 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008628 | Grad Max: 0.008628 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000938 | Grad Max: 0.146875 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017268 | Grad Max: 0.819746 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.005798 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007896 | Grad Max: 0.036553 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000300 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001712 | Grad Max: 0.004438 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000153 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000488 | Grad Max: 0.001378 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.001833 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012042 | Grad Max: 0.012042 [GRADIENT NORM TOTAL] 2.6572 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.512 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000924 0.49990755] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 609/1439 | B: 493/1555 | C: 249/1799 [LOSS Ex1] A: 0.66668 | B: 0.65970 | C: 0.65968 [LOGITS Ex2 A] Mean Abs: 1.709 | Max: 6.132 [LOSS Ex2] A: 0.17856 | B: 0.38129 | C: 0.30454 ** [JOINT LOSS] ** : 0.950150 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.159361 -> Layer: shared_layers.0.bias | Grad Mean: 0.087559 | Grad Max: 0.416031 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.006747 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004422 | Grad Max: 0.004422 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000871 | Grad Max: 0.149495 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014876 | Grad Max: 0.808670 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.003786 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005471 | Grad Max: 0.024383 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000255 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001317 | Grad Max: 0.003639 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000388 | Grad Max: 0.001144 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000613 | Grad Max: 0.001867 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.009510 | Grad Max: 0.009510 [GRADIENT NORM TOTAL] 2.2826 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.314 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60823715 0.39176285] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.039 [MASKS] A(Pass/Fail): 594/1454 | B: 455/1401 | C: 178/1198 [LOSS Ex1] A: 0.66420 | B: 0.66282 | C: 0.65918 [LOGITS Ex2 A] Mean Abs: 1.731 | Max: 5.938 [LOSS Ex2] A: 0.19781 | B: 0.37848 | C: 0.31893 ** [JOINT LOSS] ** : 0.960474 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003021 | Grad Max: 0.069912 -> Layer: shared_layers.0.bias | Grad Mean: 0.192755 | Grad Max: 0.741550 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.007232 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002070 | Grad Max: 0.002070 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001299 | Grad Max: 0.188306 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023559 | Grad Max: 1.060855 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008622 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010940 | Grad Max: 0.055225 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000504 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002326 | Grad Max: 0.005682 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000202 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000677 | Grad Max: 0.001841 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001003 | Grad Max: 0.002627 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017731 | Grad Max: 0.017731 [GRADIENT NORM TOTAL] 4.0067 [EPOCH SUMMARY] Train Loss: 0.9584 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9376 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9410 -> New: 0.9376) ############################## EPOCH 71/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.391 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5684959 0.43150407] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 490/1126 | B: 474/1574 | C: 272/1776 [LOSS Ex1] A: 0.66302 | B: 0.66211 | C: 0.65677 [LOGITS Ex2 A] Mean Abs: 1.758 | Max: 5.598 [LOSS Ex2] A: 0.17072 | B: 0.40672 | C: 0.31028 ** [JOINT LOSS] ** : 0.956538 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002948 | Grad Max: 0.077437 -> Layer: shared_layers.0.bias | Grad Mean: 0.126719 | Grad Max: 0.547189 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.007205 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007358 | Grad Max: 0.007358 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000819 | Grad Max: 0.103437 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014646 | Grad Max: 0.543806 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.006398 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005729 | Grad Max: 0.032252 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001228 | Grad Max: 0.003534 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000347 | Grad Max: 0.001048 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001343 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007321 | Grad Max: 0.007321 [GRADIENT NORM TOTAL] 2.4164 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.514 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50609773 0.4939023 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 618/1430 | B: 490/1558 | C: 278/1770 [LOSS Ex1] A: 0.66321 | B: 0.66269 | C: 0.65839 [LOGITS Ex2 A] Mean Abs: 1.721 | Max: 7.062 [LOSS Ex2] A: 0.18244 | B: 0.38814 | C: 0.30608 ** [JOINT LOSS] ** : 0.953650 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002368 | Grad Max: 0.080588 -> Layer: shared_layers.0.bias | Grad Mean: 0.193622 | Grad Max: 0.959441 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.006796 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000019 | Grad Max: 0.000019 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001298 | Grad Max: 0.180900 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024244 | Grad Max: 1.029200 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000258 | Grad Max: 0.008419 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012174 | Grad Max: 0.055366 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000456 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002642 | Grad Max: 0.006510 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000197 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.001812 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001081 | Grad Max: 0.002270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019363 | Grad Max: 0.019363 [GRADIENT NORM TOTAL] 3.9649 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.461 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50750947 0.49249056] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 602/1446 | B: 494/1554 | C: 255/1793 [LOSS Ex1] A: 0.66162 | B: 0.65948 | C: 0.65971 [LOGITS Ex2 A] Mean Abs: 1.724 | Max: 5.995 [LOSS Ex2] A: 0.18287 | B: 0.38848 | C: 0.29001 ** [JOINT LOSS] ** : 0.947391 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002950 | Grad Max: 0.073082 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.124668 | Grad Max: 0.473546 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.008046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009028 | Grad Max: 0.009028 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000913 | Grad Max: 0.148651 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015776 | Grad Max: 0.821628 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006875 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006515 | Grad Max: 0.035226 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000299 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001315 | Grad Max: 0.003679 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001186 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001518 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009039 | Grad Max: 0.009039 [GRADIENT NORM TOTAL] 2.6455 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.475 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50156957 0.49843037] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 609/1439 | B: 460/1396 | C: 282/1766 [LOSS Ex1] A: 0.66062 | B: 0.66261 | C: 0.65618 [LOGITS Ex2 A] Mean Abs: 1.746 | Max: 5.998 [LOSS Ex2] A: 0.20316 | B: 0.38150 | C: 0.33110 ** [JOINT LOSS] ** : 0.965058 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005331 | Grad Max: 0.182884 -> Layer: shared_layers.0.bias | Grad Mean: 0.203688 | Grad Max: 0.926736 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.008616 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009103 | Grad Max: 0.009103 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001572 | Grad Max: 0.193502 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028530 | Grad Max: 1.077740 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000276 | Grad Max: 0.008354 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012755 | Grad Max: 0.058944 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002854 | Grad Max: 0.006953 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000840 | Grad Max: 0.002071 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001224 | Grad Max: 0.002348 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020865 | Grad Max: 0.020865 [GRADIENT NORM TOTAL] 4.3424 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.401 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043908 0.49560922] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 568/1480 | B: 477/1571 | C: 268/1780 [LOSS Ex1] A: 0.66594 | B: 0.66190 | C: 0.65758 [LOGITS Ex2 A] Mean Abs: 1.723 | Max: 6.039 [LOSS Ex2] A: 0.17875 | B: 0.40477 | C: 0.31786 ** [JOINT LOSS] ** : 0.962270 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004370 | Grad Max: 0.111008 -> Layer: shared_layers.0.bias | Grad Mean: 0.216595 | Grad Max: 1.080723 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.006217 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003124 | Grad Max: 0.003124 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001468 | Grad Max: 0.156772 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027254 | Grad Max: 0.878531 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000274 | Grad Max: 0.008442 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012685 | Grad Max: 0.062099 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000454 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002757 | Grad Max: 0.006373 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000203 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000802 | Grad 
Max: 0.002019 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001078 | Grad Max: 0.002463 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019793 | Grad Max: 0.019793 [GRADIENT NORM TOTAL] 4.2216 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.074 | Max: 0.326 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5287489 0.4712511] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.531 | Std: 0.037 [MASKS] A(Pass/Fail): 570/1478 | B: 492/1556 | C: 274/1774 [LOSS Ex1] A: 0.66565 | B: 0.66249 | C: 0.65802 [LOGITS Ex2 A] Mean Abs: 1.671 | Max: 6.066 [LOSS Ex2] A: 0.19307 | B: 0.40170 | C: 0.29462 ** [JOINT LOSS] ** : 0.958515 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004848 | Grad Max: 0.134589 -> Layer: shared_layers.0.bias | Grad Mean: 0.232382 | Grad Max: 0.852169 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001993 | Grad Max: 0.006965 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000050 | Grad Max: 0.000050 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001499 | Grad Max: 0.200420 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027839 | Grad Max: 1.124529 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000287 | Grad Max: 0.009481 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013288 | Grad Max: 0.061274 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002926 | Grad Max: 0.006852 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000852 | Grad Max: 0.002024 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001317 | Grad Max: 0.002523 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021671 | Grad Max: 0.021671 [GRADIENT NORM TOTAL] 4.4608 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.463 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.63251436 0.36748567] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.039 [MASKS] A(Pass/Fail): 640/1408 | B: 497/1551 | C: 256/1792 [LOSS Ex1] A: 0.66202 | B: 0.65927 | C: 0.65986 [LOGITS Ex2 A] Mean Abs: 1.722 | Max: 6.390 [LOSS Ex2] A: 0.17889 | B: 0.37970 | C: 0.32848 ** [JOINT LOSS] ** : 0.956074 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004805 | Grad Max: 0.145682 -> Layer: shared_layers.0.bias | Grad Mean: 0.205079 | Grad Max: 0.761327 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.007662 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005794 | Grad Max: 0.005794 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001358 | Grad Max: 0.169091 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025037 | Grad Max: 0.930979 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000254 | Grad Max: 0.007886 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011757 | Grad Max: 0.060057 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000406 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002630 | Grad Max: 0.006283 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000774 | Grad Max: 0.001899 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001103 | Grad Max: 0.002144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019510 | Grad Max: 0.019510 [GRADIENT NORM TOTAL] 3.8819 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.519 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000806 0.4999194] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 614/1434 | B: 462/1394 | C: 275/1773 [LOSS Ex1] A: 0.66628 | B: 0.66243 | C: 0.65572 [LOGITS Ex2 A] Mean Abs: 1.746 | Max: 5.956 [LOSS Ex2] A: 0.18064 | B: 0.37283 | C: 0.31274 ** [JOINT LOSS] ** : 0.950210 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.004300 | Grad Max: 0.146307 -> Layer: shared_layers.0.bias | Grad Mean: 0.155161 | Grad Max: 0.647833 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.006634 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001715 | Grad Max: 0.001715 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001182 | Grad Max: 0.123889 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020247 | Grad Max: 0.672061 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.008856 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008321 | Grad Max: 0.051305 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001673 | Grad Max: 0.004314 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000478 | Grad Max: 0.001268 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000644 | Grad Max: 0.002018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012354 | Grad Max: 0.012354 [GRADIENT NORM TOTAL] 3.2755 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.320 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61003166 0.3899683 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 597/1451 | B: 481/1567 | C: 258/1790 [LOSS Ex1] A: 0.66378 | B: 0.66172 | C: 0.65805 [LOGITS Ex2 A] Mean Abs: 1.745 | Max: 6.234 [LOSS Ex2] A: 0.19573 | B: 0.40121 | C: 0.28849 ** [JOINT LOSS] ** : 0.956329 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002592 | Grad Max: 0.077092 -> Layer: shared_layers.0.bias | Grad Mean: 0.205274 | Grad Max: 0.934262 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.007365 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006182 | Grad Max: 0.006182 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001358 | Grad Max: 0.145331 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.025044 | Grad Max: 0.806665 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000258 | Grad Max: 0.008441 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012096 | Grad Max: 0.052618 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000481 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002574 | Grad Max: 0.006289 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000187 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000752 | Grad Max: 0.001889 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001110 | Grad Max: 0.002600 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019817 | Grad Max: 0.019817 [GRADIENT NORM TOTAL] 4.1253 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.396 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5696789 0.4303211] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.039 [MASKS] A(Pass/Fail): 494/1122 | B: 495/1553 | C: 265/1783 [LOSS Ex1] A: 0.66260 | B: 0.66231 | C: 0.65860 [LOGITS Ex2 A] Mean Abs: 1.755 | Max: 6.541 [LOSS Ex2] A: 0.18216 | B: 0.40031 | C: 0.32481 ** [JOINT LOSS] ** : 0.963599 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002757 | Grad Max: 0.078040 -> Layer: shared_layers.0.bias | Grad Mean: 0.122765 | Grad Max: 0.537053 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.007122 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000618 | Grad Max: 0.000618 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000808 | Grad Max: 0.139410 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014633 | Grad Max: 0.761982 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.005446 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006624 | Grad Max: 0.038472 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000258 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001446 | Grad Max: 0.003615 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000434 | Grad Max: 0.001233 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000653 | Grad Max: 0.001350 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012073 | Grad Max: 0.012073 [GRADIENT NORM TOTAL] 2.4836 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.521 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061299 0.49387008] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 625/1423 | B: 498/1550 | C: 291/1757 [LOSS Ex1] A: 0.66279 | B: 0.65907 | C: 0.65665 [LOGITS Ex2 A] Mean Abs: 1.725 | Max: 5.367 [LOSS Ex2] A: 0.18690 | B: 0.38356 | C: 0.30666 ** [JOINT LOSS] ** : 0.951875 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002328 | Grad Max: 0.053462 -> Layer: shared_layers.0.bias | Grad Mean: 0.108052 | Grad Max: 0.452071 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.006830 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000135 | Grad Max: 0.000135 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.293678 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013540 | Grad Max: 1.652915 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.006123 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005463 | Grad Max: 0.039911 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000300 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001136 | Grad Max: 0.003761 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000322 | Grad Max: 0.001177 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000437 | Grad Max: 0.001468 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007881 | Grad Max: 0.007881 [GRADIENT NORM TOTAL] 2.8317 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.467 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50776315 0.4922368 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 605/1443 | B: 463/1393 | C: 267/1781 [LOSS Ex1] A: 0.66118 | B: 0.66224 | C: 0.65797 [LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.782 [LOSS Ex2] A: 0.17910 | B: 0.37239 | C: 0.29579 ** [JOINT LOSS] ** : 0.942888 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003565 | Grad Max: 0.102970 -> Layer: shared_layers.0.bias | Grad Mean: 0.134763 | Grad Max: 0.486020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.008179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010279 | Grad Max: 0.010279 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001006 | Grad Max: 0.142499 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018174 | Grad Max: 0.736281 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.005273 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007509 | Grad Max: 0.034743 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001685 | Grad Max: 0.004300 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000506 | Grad Max: 0.001322 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000819 | Grad Max: 0.002430 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014140 | Grad Max: 0.014140 [GRADIENT NORM TOTAL] 2.9474 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.482 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50157166 0.49842837] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.039 [MASKS] A(Pass/Fail): 614/1434 | B: 484/1564 | C: 256/1792 [LOSS Ex1] A: 0.66018 | B: 0.66153 | C: 0.65751 [LOGITS Ex2 A] Mean Abs: 1.719 | Max: 
6.788 [LOSS Ex2] A: 0.19779 | B: 0.40387 | C: 0.30647 ** [JOINT LOSS] ** : 0.962449 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.120234 -> Layer: shared_layers.0.bias | Grad Mean: 0.068272 | Grad Max: 0.278382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.008125 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004683 | Grad Max: 0.004683 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000638 | Grad Max: 0.186946 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010548 | Grad Max: 1.043570 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003982 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003157 | Grad Max: 0.019583 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000248 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000775 | Grad Max: 0.002606 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000237 | Grad Max: 0.000703 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001267 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006214 | Grad Max: 0.006214 [GRADIENT NORM TOTAL] 1.9948 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.405 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50437236 0.4956276 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.532 | Std: 0.039 [MASKS] A(Pass/Fail): 576/1472 | B: 498/1550 | C: 186/1190 [LOSS Ex1] A: 0.66558 | B: 0.66212 | C: 0.65731 [LOGITS Ex2 A] Mean Abs: 1.682 | Max: 5.706 [LOSS Ex2] A: 0.17459 | B: 0.40493 | C: 0.32830 ** [JOINT LOSS] ** : 0.964279 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004864 | Grad Max: 0.141049 -> Layer: shared_layers.0.bias | Grad Mean: 0.215183 | Grad Max: 0.882134 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.006086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002387 | 
Grad Max: 0.002387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001568 | Grad Max: 0.232661 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029392 | Grad Max: 1.294034 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.008021 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013425 | Grad Max: 0.054880 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000499 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002937 | Grad Max: 0.006658 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000233 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000857 | Grad Max: 0.002196 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001273 | Grad Max: 0.002454 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021996 | Grad Max: 0.021996 [GRADIENT NORM TOTAL] 4.5733 [EPOCH SUMMARY] Train Loss: 0.9565 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9360 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9376 -> New: 0.9360) ############################## EPOCH 72/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.075 | Max: 0.331 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5292787 0.47072127] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.038 [MASKS] A(Pass/Fail): 582/1466 | B: 499/1549 | C: 298/1750 [LOSS Ex1] A: 0.66529 | B: 0.65887 | C: 0.65679 [LOGITS Ex2 A] Mean Abs: 1.673 | Max: 6.934 [LOSS Ex2] A: 0.18471 | B: 0.38429 | C: 0.31148 ** [JOINT LOSS] ** : 0.953812 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003323 | Grad Max: 0.108317 -> Layer: shared_layers.0.bias | Grad Mean: 0.108771 | Grad Max: 0.374110 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.006779 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004990 | Grad Max: 0.004990 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000843 | Grad Max: 0.165471 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.015168 | Grad Max: 0.928244 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.004662 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005712 | Grad Max: 0.027641 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000257 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001287 | Grad Max: 0.003712 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000377 | Grad Max: 0.001033 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000591 | Grad Max: 0.001613 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009406 | Grad Max: 0.009406 [GRADIENT NORM TOTAL] 2.5910 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.468 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6347861 0.36521387] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.040 [MASKS] A(Pass/Fail): 652/1396 | B: 465/1391 | C: 283/1765 [LOSS Ex1] A: 0.66160 | B: 0.66204 | C: 0.65597 [LOGITS Ex2 A] Mean Abs: 1.739 | Max: 6.632 [LOSS Ex2] A: 0.17872 | B: 0.37605 | C: 0.31157 ** [JOINT LOSS] ** : 0.948652 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003652 | Grad Max: 0.101862 -> Layer: shared_layers.0.bias | Grad Mean: 0.281203 | Grad Max: 1.253438 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.008291 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012997 | Grad Max: 0.012997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001759 | Grad Max: 0.224583 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032815 | Grad Max: 1.241070 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.010870 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016692 | Grad Max: 0.079055 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000562 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003638 | Grad Max: 0.008055 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000232 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001060 | Grad Max: 0.002418 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001443 | Grad Max: 0.003467 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026605 | Grad Max: 0.026605 [GRADIENT NORM TOTAL] 5.4370 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.524 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50006914 0.49993086] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 623/1425 | B: 487/1561 | C: 251/1797 [LOSS Ex1] A: 0.66593 | B: 0.66133 | C: 0.65937 [LOGITS Ex2 A] Mean Abs: 1.743 | Max: 5.347 [LOSS Ex2] A: 0.18063 | B: 0.40770 | C: 0.32093 ** [JOINT LOSS] ** : 0.965297 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003117 | Grad Max: 0.109055 -> Layer: shared_layers.0.bias | Grad Mean: 0.298000 | Grad Max: 1.278504 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006895 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006000 | Grad Max: 0.006000 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001894 | Grad Max: 0.232702 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035168 | Grad Max: 1.305956 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.011738 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017238 | Grad Max: 0.086764 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000579 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003623 | Grad Max: 0.008446 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000260 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001045 | Grad Max: 0.002575 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001397 | Grad Max: 0.002905 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026243 | Grad Max: 0.026243 [GRADIENT NORM TOTAL] 5.9075 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.325 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6118684 0.38813165] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.040 [MASKS] A(Pass/Fail): 604/1444 | B: 502/1546 | C: 286/1762 [LOSS Ex1] A: 0.66340 | B: 0.66193 | C: 0.65629 [LOGITS Ex2 A] Mean Abs: 1.706 | Max: 6.380 [LOSS Ex2] A: 0.19727 | B: 0.40313 | C: 0.30234 ** [JOINT LOSS] ** : 0.961454 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003115 | Grad Max: 0.081051 -> Layer: shared_layers.0.bias | Grad Mean: 0.110506 | Grad Max: 0.522000 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.006522 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003347 | Grad Max: 0.003347 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000808 | Grad Max: 0.103885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014349 | Grad Max: 0.533061 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000153 | Grad Max: 0.005025 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007008 | Grad Max: 0.028537 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000262 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001572 | Grad Max: 0.003862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000132 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000476 | Grad Max: 0.001305 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000796 | Grad Max: 0.001794 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012953 | Grad Max: 0.012953 [GRADIENT NORM TOTAL] 2.1273 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.401 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5709001 0.42909992] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.040 [MASKS] A(Pass/Fail): 499/1117 | B: 500/1548 | C: 261/1787 [LOSS Ex1] A: 0.66220 | B: 0.65866 | C: 0.65906 [LOGITS Ex2 A] Mean Abs: 1.751 | Max: 6.624 
[LOSS Ex2] A: 0.18149 | B: 0.38279 | C: 0.32900 ** [JOINT LOSS] ** : 0.957736 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001665 | Grad Max: 0.029053 -> Layer: shared_layers.0.bias | Grad Mean: 0.060860 | Grad Max: 0.246321 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.006807 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004908 | Grad Max: 0.004908 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000474 | Grad Max: 0.082093 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008093 | Grad Max: 0.460030 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.004107 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003131 | Grad Max: 0.019694 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000665 | Grad Max: 0.002533 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000198 | Grad Max: 0.000746 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000273 | Grad Max: 0.001068 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004834 | Grad Max: 0.004834 [GRADIENT NORM TOTAL] 1.3217 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.526 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061419 0.4938581] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 631/1417 | B: 468/1388 | C: 263/1785 [LOSS Ex1] A: 0.66239 | B: 0.66184 | C: 0.65779 [LOGITS Ex2 A] Mean Abs: 1.742 | Max: 6.678 [LOSS Ex2] A: 0.19240 | B: 0.37873 | C: 0.29137 ** [JOINT LOSS] ** : 0.948169 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003673 | Grad Max: 0.083851 -> Layer: shared_layers.0.bias | Grad Mean: 0.225707 | Grad Max: 0.945202 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.007181 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002302 | Grad Max: 
0.002302 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001537 | Grad Max: 0.179279 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028803 | Grad Max: 0.991492 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.009628 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013077 | Grad Max: 0.066581 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000426 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002866 | Grad Max: 0.006463 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000857 | Grad Max: 0.002125 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001287 | Grad Max: 0.003387 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023201 | Grad Max: 0.023201 [GRADIENT NORM TOTAL] 4.5597 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.473 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079855 0.49201453] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 607/1441 | B: 488/1560 | C: 250/1798 [LOSS Ex1] A: 0.66073 | B: 0.66113 | C: 0.65784 [LOGITS Ex2 A] Mean Abs: 1.736 | Max: 5.332 [LOSS Ex2] A: 0.18876 | B: 0.40695 | C: 0.27634 ** [JOINT LOSS] ** : 0.950583 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004767 | Grad Max: 0.116188 -> Layer: shared_layers.0.bias | Grad Mean: 0.246410 | Grad Max: 1.028237 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.008035 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011713 | Grad Max: 0.011713 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001679 | Grad Max: 0.156089 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031325 | Grad Max: 0.861045 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000311 | Grad Max: 0.010065 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014545 | Grad Max: 0.070290 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000554 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.003175 | Grad Max: 0.007585 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000925 | Grad Max: 0.002259 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001254 | Grad Max: 0.002967 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023159 | Grad Max: 0.023159 [GRADIENT NORM TOTAL] 4.8239 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.488 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501587 0.49841306] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 617/1431 | B: 504/1544 | C: 260/1788 [LOSS Ex1] A: 0.65972 | B: 0.66173 | C: 0.65605 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.129 [LOSS Ex2] A: 0.19386 | B: 0.40194 | C: 0.30604 ** [JOINT LOSS] ** : 0.959780 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003124 | Grad Max: 0.062710 -> Layer: shared_layers.0.bias | Grad Mean: 0.195525 | Grad Max: 0.841520 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.007510 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001212 | Grad Max: 0.001212 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001248 | Grad Max: 0.176952 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022521 | Grad Max: 0.978417 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008388 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010487 | Grad Max: 0.058088 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000381 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002203 | Grad Max: 0.004873 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000618 | Grad Max: 0.001456 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000842 | Grad Max: 0.001970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015263 | Grad Max: 0.015263 [GRADIENT NORM 
TOTAL] 3.8145 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.411 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043904 0.4956096] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.040 [MASKS] A(Pass/Fail): 578/1470 | B: 503/1545 | C: 268/1780 [LOSS Ex1] A: 0.66521 | B: 0.65845 | C: 0.65901 [LOGITS Ex2 A] Mean Abs: 1.668 | Max: 5.972 [LOSS Ex2] A: 0.17859 | B: 0.38637 | C: 0.30273 ** [JOINT LOSS] ** : 0.950118 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002836 | Grad Max: 0.061505 -> Layer: shared_layers.0.bias | Grad Mean: 0.171795 | Grad Max: 0.787494 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002034 | Grad Max: 0.006023 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004224 | Grad Max: 0.004224 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001239 | Grad Max: 0.146346 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022735 | Grad Max: 0.832709 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008902 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011068 | Grad Max: 0.052146 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002394 | Grad Max: 0.005469 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000174 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000687 | Grad Max: 0.001753 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000995 | Grad Max: 0.002534 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017027 | Grad Max: 0.017027 [GRADIENT NORM TOTAL] 3.5688 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.337 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.52967405 0.47032598] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.532 | Std: 0.039 [MASKS] A(Pass/Fail): 583/1465 | B: 469/1387 | C: 284/1764 [LOSS Ex1] A: 
0.66491 | B: 0.66164 | C: 0.65449 [LOGITS Ex2 A] Mean Abs: 1.671 | Max: 6.610 [LOSS Ex2] A: 0.19264 | B: 0.37756 | C: 0.32947 ** [JOINT LOSS] ** : 0.960237 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003998 | Grad Max: 0.116749 -> Layer: shared_layers.0.bias | Grad Mean: 0.177324 | Grad Max: 0.797929 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.006724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004980 | Grad Max: 0.004980 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001232 | Grad Max: 0.144904 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022624 | Grad Max: 0.751381 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.008203 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010423 | Grad Max: 0.057257 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000389 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002298 | Grad Max: 0.005415 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000175 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000664 | Grad Max: 0.001589 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000836 | Grad Max: 0.001968 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015523 | Grad Max: 0.015523 [GRADIENT NORM TOTAL] 3.5850 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.473 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6369923 0.36300772] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 654/1394 | B: 489/1559 | C: 288/1760 [LOSS Ex1] A: 0.66117 | B: 0.66094 | C: 0.65465 [LOGITS Ex2 A] Mean Abs: 1.720 | Max: 6.115 [LOSS Ex2] A: 0.16711 | B: 0.39581 | C: 0.29338 ** [JOINT LOSS] ** : 0.944351 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.041499 -> Layer: shared_layers.0.bias | Grad Mean: 0.084743 | Grad Max: 0.363287 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002238 | Grad 
Max: 0.007833 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007818 | Grad Max: 0.007818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000718 | Grad Max: 0.110685 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012701 | Grad Max: 0.627129 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.004844 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005477 | Grad Max: 0.027689 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000254 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001208 | Grad Max: 0.003592 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000112 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000361 | Grad Max: 0.000988 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000507 | Grad Max: 0.001892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009535 | Grad Max: 0.009535 [GRADIENT NORM TOTAL] 2.1304 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.530 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000474 0.49995264] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 629/1419 | B: 506/1542 | C: 268/1780 [LOSS Ex1] A: 0.66555 | B: 0.66154 | C: 0.65644 [LOGITS Ex2 A] Mean Abs: 1.704 | Max: 5.743 [LOSS Ex2] A: 0.17624 | B: 0.40173 | C: 0.29929 ** [JOINT LOSS] ** : 0.953598 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005826 | Grad Max: 0.190989 -> Layer: shared_layers.0.bias | Grad Mean: 0.252968 | Grad Max: 1.026355 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002031 | Grad Max: 0.006574 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003423 | Grad Max: 0.003423 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001740 | Grad Max: 0.197425 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031585 | Grad Max: 1.067955 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.009404 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014770 | Grad Max: 0.063298 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000507 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003337 | Grad Max: 0.007129 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002408 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001356 | Grad Max: 0.002723 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023758 | Grad Max: 0.023758 [GRADIENT NORM TOTAL] 4.8743 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.330 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6135236 0.38647643] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 608/1440 | B: 504/1544 | C: 268/1780 [LOSS Ex1] A: 0.66298 | B: 0.65825 | C: 0.65690 [LOGITS Ex2 A] Mean Abs: 1.716 | Max: 5.878 [LOSS Ex2] A: 0.18972 | B: 0.38331 | C: 0.31088 ** [JOINT LOSS] ** : 0.954010 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003780 | Grad Max: 0.092864 -> Layer: shared_layers.0.bias | Grad Mean: 0.170660 | Grad Max: 0.758402 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.007554 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004585 | Grad Max: 0.004585 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001080 | Grad Max: 0.121937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019732 | Grad Max: 0.619565 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.007754 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009200 | Grad Max: 0.051600 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000366 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002044 | Grad Max: 0.004993 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001631 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000708 | Grad Max: 0.001723 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.013410 | Grad Max: 0.013410 [GRADIENT NORM TOTAL] 3.1471 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.406 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5719 0.42809996] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 502/1114 | B: 471/1385 | C: 174/1202 [LOSS Ex1] A: 0.66177 | B: 0.66144 | C: 0.65587 [LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.948 [LOSS Ex2] A: 0.17850 | B: 0.36575 | C: 0.29837 ** [JOINT LOSS] ** : 0.940571 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.086433 -> Layer: shared_layers.0.bias | Grad Mean: 0.198509 | Grad Max: 0.977351 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.006954 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001283 | Grad Max: 0.001283 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001323 | Grad Max: 0.147752 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024396 | Grad Max: 0.827373 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.008620 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011177 | Grad Max: 0.049957 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000384 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002407 | Grad Max: 0.005618 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000176 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000705 | Grad Max: 0.001637 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000912 | Grad Max: 0.002461 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017572 | Grad Max: 0.017572 [GRADIENT NORM TOTAL] 4.1565 [EPOCH SUMMARY] Train Loss: 0.9535 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9353 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9360 -> New: 0.9353) ############################## EPOCH 73/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.532 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062554 0.4937446] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 635/1413 | B: 490/1558 | C: 263/1785 [LOSS Ex1] A: 0.66198 | B: 0.66074 | C: 0.65550 [LOGITS Ex2 A] Mean Abs: 1.764 | Max: 8.323 [LOSS Ex2] A: 0.18478 | B: 0.40067 | C: 0.29184 ** [JOINT LOSS] ** : 0.951840 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002935 | Grad Max: 0.074177 -> Layer: shared_layers.0.bias | Grad Mean: 0.225713 | Grad Max: 0.982091 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.007444 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005924 | Grad Max: 0.005924 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001538 | Grad Max: 0.151925 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028775 | Grad Max: 0.851188 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009525 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013797 | Grad Max: 0.068633 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000461 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003008 | Grad Max: 0.006850 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000215 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000878 | Grad Max: 0.002101 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001182 | Grad Max: 0.002772 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022154 | Grad Max: 0.022154 [GRADIENT NORM TOTAL] 4.6186 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.480 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50807905 0.49192095] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 609/1439 | B: 510/1538 | C: 250/1798 [LOSS Ex1] A: 0.66031 | B: 0.66134 | C: 0.65789 [LOGITS Ex2 A] Mean Abs: 1.728 | Max: 5.516 [LOSS Ex2] A: 0.18242 | B: 0.39621 | C: 0.32467 ** [JOINT LOSS] ** : 0.960945 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.045483 -> Layer: shared_layers.0.bias | Grad Mean: 0.040156 | Grad Max: 0.209246 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.008380 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015103 | Grad Max: 0.015103 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000437 | Grad Max: 0.119613 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007142 | Grad Max: 0.662676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003517 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001496 | Grad Max: 0.013644 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000134 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000241 | Grad Max: 0.001505 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000073 | Grad Max: 0.000471 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000188 | Grad Max: 0.000777 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001042 | Grad Max: 0.001042 [GRADIENT NORM TOTAL] 1.3585 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.495 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016087 0.4983913] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 622/1426 | B: 504/1544 | C: 254/1794 [LOSS Ex1] A: 0.65931 | B: 0.65804 | C: 0.65724 [LOGITS Ex2 A] Mean Abs: 1.690 | Max: 6.732 [LOSS Ex2] A: 0.20295 | B: 0.39099 | C: 0.29950 ** [JOINT LOSS] ** : 0.956007 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003806 | Grad Max: 0.100134 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.258279 | Grad Max: 1.224548 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002313 | Grad Max: 0.008319 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009592 | Grad Max: 0.009592 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001689 | Grad Max: 0.346910 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031510 | Grad Max: 1.956380 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.008787 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014208 | Grad Max: 0.063214 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000463 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003000 | Grad Max: 0.006857 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000209 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000849 | Grad Max: 0.002216 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001141 | Grad Max: 0.002282 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020629 | Grad Max: 0.020629 [GRADIENT NORM TOTAL] 5.4977 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.416 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504375 0.49562502] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.533 | Std: 0.040 [MASKS] A(Pass/Fail): 584/1464 | B: 473/1383 | C: 283/1765 [LOSS Ex1] A: 0.66486 | B: 0.66123 | C: 0.65559 [LOGITS Ex2 A] Mean Abs: 1.679 | Max: 5.880 [LOSS Ex2] A: 0.17868 | B: 0.37508 | C: 0.27429 ** [JOINT LOSS] ** : 0.936578 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003993 | Grad Max: 0.098105 -> Layer: shared_layers.0.bias | Grad Mean: 0.207280 | Grad Max: 1.010794 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002004 | Grad Max: 0.006063 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002022 | Grad Max: 0.002022 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001468 | Grad Max: 0.246577 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027207 | Grad Max: 1.387417 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 
| Grad Max: 0.009316 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012722 | Grad Max: 0.057906 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000466 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002769 | Grad Max: 0.006416 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000777 | Grad Max: 0.001956 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000994 | Grad Max: 0.002591 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017549 | Grad Max: 0.017549 [GRADIENT NORM TOTAL] 4.4122 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.076 | Max: 0.342 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5300262 0.46997383] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.533 | Std: 0.039 [MASKS] A(Pass/Fail): 586/1462 | B: 491/1557 | C: 290/1758 [LOSS Ex1] A: 0.66458 | B: 0.66053 | C: 0.65626 [LOGITS Ex2 A] Mean Abs: 1.701 | Max: 5.874 [LOSS Ex2] A: 0.18775 | B: 0.40231 | C: 0.29261 ** [JOINT LOSS] ** : 0.954680 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.115910 -> Layer: shared_layers.0.bias | Grad Mean: 0.250935 | Grad Max: 0.903025 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.006628 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005240 | Grad Max: 0.005240 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001683 | Grad Max: 0.187139 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031425 | Grad Max: 1.045301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000308 | Grad Max: 0.010065 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014695 | Grad Max: 0.071098 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000502 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003234 | Grad Max: 0.007175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000237 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000939 | Grad Max: 
0.002219 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001253 | Grad Max: 0.003177 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023251 | Grad Max: 0.023251 [GRADIENT NORM TOTAL] 5.0123 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.477 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63904685 0.36095318] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.041 [MASKS] A(Pass/Fail): 658/1390 | B: 510/1538 | C: 266/1782 [LOSS Ex1] A: 0.66079 | B: 0.66113 | C: 0.65668 [LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.672 [LOSS Ex2] A: 0.18968 | B: 0.40676 | C: 0.33208 ** [JOINT LOSS] ** : 0.969044 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.132613 -> Layer: shared_layers.0.bias | Grad Mean: 0.364266 | Grad Max: 1.736161 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.007855 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011474 | Grad Max: 0.011474 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.211380 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042839 | Grad Max: 1.187697 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.014465 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021496 | Grad Max: 0.103471 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000742 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004670 | Grad Max: 0.010675 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000308 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001350 | Grad Max: 0.003163 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001815 | Grad Max: 0.003645 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032853 | Grad Max: 0.032853 [GRADIENT NORM TOTAL] 7.1584 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.535 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.50013083 0.4998692 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 632/1416 | B: 506/1542 | C: 264/1784 [LOSS Ex1] A: 0.66523 | B: 0.65783 | C: 0.65584 [LOGITS Ex2 A] Mean Abs: 1.726 | Max: 5.841 [LOSS Ex2] A: 0.17618 | B: 0.37404 | C: 0.29690 ** [JOINT LOSS] ** : 0.942007 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002437 | Grad Max: 0.080562 -> Layer: shared_layers.0.bias | Grad Mean: 0.065183 | Grad Max: 0.350468 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.006815 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005661 | Grad Max: 0.005661 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000606 | Grad Max: 0.082973 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010007 | Grad Max: 0.467036 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003981 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003290 | Grad Max: 0.022472 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000225 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000619 | Grad Max: 0.002611 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000181 | Grad Max: 0.000823 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000556 | Grad Max: 0.001634 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004970 | Grad Max: 0.004970 [GRADIENT NORM TOTAL] 1.6231 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61513406 0.38486597] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 612/1436 | B: 473/1383 | C: 288/1760 [LOSS Ex1] A: 0.66264 | B: 0.66103 | C: 0.65544 [LOGITS Ex2 A] Mean Abs: 1.686 | Max: 6.209 [LOSS Ex2] A: 0.19207 | B: 0.39056 | C: 0.33784 ** [JOINT LOSS] ** : 0.966529 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight 
| Grad Mean: 0.007493 | Grad Max: 0.176201 -> Layer: shared_layers.0.bias | Grad Mean: 0.498383 | Grad Max: 2.076265 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002048 | Grad Max: 0.006531 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003602 | Grad Max: 0.003602 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003120 | Grad Max: 0.330746 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058721 | Grad Max: 1.842866 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000605 | Grad Max: 0.019010 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029023 | Grad Max: 0.131452 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000911 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006346 | Grad Max: 0.013403 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000388 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001827 | Grad Max: 0.004242 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002442 | Grad Max: 0.004547 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044303 | Grad Max: 0.044303 [GRADIENT NORM TOTAL] 9.2727 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.411 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57294804 0.427052 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 504/1112 | B: 493/1555 | C: 259/1789 [LOSS Ex1] A: 0.66143 | B: 0.66033 | C: 0.65875 [LOGITS Ex2 A] Mean Abs: 1.717 | Max: 5.686 [LOSS Ex2] A: 0.18481 | B: 0.43425 | C: 0.33937 ** [JOINT LOSS] ** : 0.979647 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008092 | Grad Max: 0.206760 -> Layer: shared_layers.0.bias | Grad Mean: 0.635454 | Grad Max: 2.666595 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.006802 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004852 | Grad Max: 0.004852 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003971 | Grad Max: 0.414810 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.075184 | Grad Max: 2.303638 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000778 | Grad Max: 0.023819 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037430 | Grad Max: 0.175566 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001201 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008124 | Grad Max: 0.016397 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002358 | Grad Max: 0.005279 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003282 | Grad Max: 0.006487 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059285 | Grad Max: 0.059285 [GRADIENT NORM TOTAL] 12.2112 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.537 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50612056 0.4938794 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 637/1411 | B: 510/1538 | C: 257/1791 [LOSS Ex1] A: 0.66164 | B: 0.66094 | C: 0.65860 [LOGITS Ex2 A] Mean Abs: 1.716 | Max: 7.204 [LOSS Ex2] A: 0.18007 | B: 0.40679 | C: 0.30827 ** [JOINT LOSS] ** : 0.958771 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004062 | Grad Max: 0.124543 -> Layer: shared_layers.0.bias | Grad Mean: 0.358357 | Grad Max: 1.543433 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002041 | Grad Max: 0.006349 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001608 | Grad Max: 0.001608 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.247619 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042047 | Grad Max: 1.403629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.015734 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021594 | Grad Max: 0.106267 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000706 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004617 | Grad Max: 0.010160 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | 
Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001327 | Grad Max: 0.002923 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001863 | Grad Max: 0.003196 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033095 | Grad Max: 0.033095 [GRADIENT NORM TOTAL] 6.9557 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.486 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083471 0.49165288] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 612/1436 | B: 508/1540 | C: 278/1770 [LOSS Ex1] A: 0.65994 | B: 0.65763 | C: 0.65566 [LOGITS Ex2 A] Mean Abs: 1.774 | Max: 6.016 [LOSS Ex2] A: 0.18947 | B: 0.38606 | C: 0.31104 ** [JOINT LOSS] ** : 0.953267 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006855 | Grad Max: 0.204276 -> Layer: shared_layers.0.bias | Grad Mean: 0.331515 | Grad Max: 1.448843 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.007400 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002464 | Grad Max: 0.002464 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.231249 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042120 | Grad Max: 1.184505 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.011412 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019372 | Grad Max: 0.078371 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000646 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004300 | Grad Max: 0.009401 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000302 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001248 | Grad Max: 0.003100 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001709 | Grad Max: 0.003319 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030834 | Grad Max: 0.030834 [GRADIENT NORM TOTAL] 6.5553 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.500 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014658 0.49853417] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 623/1425 | B: 473/1383 | C: 283/1765 [LOSS Ex1] A: 0.65895 | B: 0.66085 | C: 0.65585 [LOGITS Ex2 A] Mean Abs: 1.770 | Max: 7.239 [LOSS Ex2] A: 0.22558 | B: 0.38419 | C: 0.32377 ** [JOINT LOSS] ** : 0.969730 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010562 | Grad Max: 0.341549 -> Layer: shared_layers.0.bias | Grad Mean: 0.557792 | Grad Max: 2.313211 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007606 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000784 | Grad Max: 0.000784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003841 | Grad Max: 0.346128 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070943 | Grad Max: 1.850459 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000702 | Grad Max: 0.018901 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033322 | Grad Max: 0.143137 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.001075 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007385 | Grad Max: 0.016077 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000500 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002146 | Grad Max: 0.005283 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002919 | Grad Max: 0.005083 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052059 | Grad Max: 0.052059 [GRADIENT NORM TOTAL] 10.9466 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.420 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041726 0.49582738] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 590/1458 | B: 494/1554 | C: 281/1767 [LOSS Ex1] A: 0.66458 | B: 0.66015 | C: 0.65219 [LOGITS Ex2 A] Mean Abs: 1.753 | Max: 6.769 [LOSS Ex2] A: 0.18399 | B: 0.40133 | C: 
0.30008 ** [JOINT LOSS] ** : 0.954107 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007210 | Grad Max: 0.211014 -> Layer: shared_layers.0.bias | Grad Mean: 0.362456 | Grad Max: 1.623585 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006377 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001979 | Grad Max: 0.001979 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.238710 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043610 | Grad Max: 1.175551 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.012406 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020254 | Grad Max: 0.092614 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000758 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004530 | Grad Max: 0.010533 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001322 | Grad Max: 0.003107 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001756 | Grad Max: 0.003577 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032265 | Grad Max: 0.032265 [GRADIENT NORM TOTAL] 6.9576 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.077 | Max: 0.346 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5304678 0.46953222] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.533 | Std: 0.040 [MASKS] A(Pass/Fail): 588/1460 | B: 510/1538 | C: 197/1179 [LOSS Ex1] A: 0.66430 | B: 0.66078 | C: 0.65424 [LOGITS Ex2 A] Mean Abs: 1.660 | Max: 6.278 [LOSS Ex2] A: 0.18396 | B: 0.40069 | C: 0.31378 ** [JOINT LOSS] ** : 0.959250 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.071598 -> Layer: shared_layers.0.bias | Grad Mean: 0.163129 | Grad Max: 0.862666 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.006609 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005358 | Grad Max: 0.005358 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.157590 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020034 | Grad Max: 0.883447 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000190 | Grad Max: 0.007440 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009252 | Grad Max: 0.053541 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000362 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001995 | Grad Max: 0.004881 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001607 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000846 | Grad Max: 0.001777 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015312 | Grad Max: 0.015312 [GRADIENT NORM TOTAL] 3.4284 [EPOCH SUMMARY] Train Loss: 0.9580 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9346 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9353 -> New: 0.9346) ############################## EPOCH 74/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.480 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6408551 0.35914493] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 660/1388 | B: 511/1537 | C: 264/1784 [LOSS Ex1] A: 0.66046 | B: 0.65747 | C: 0.65498 [LOGITS Ex2 A] Mean Abs: 1.710 | Max: 6.486 [LOSS Ex2] A: 0.16524 | B: 0.38674 | C: 0.29831 ** [JOINT LOSS] ** : 0.941070 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003597 | Grad Max: 0.083479 -> Layer: shared_layers.0.bias | Grad Mean: 0.254006 | Grad Max: 1.231914 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.007762 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008000 | Grad Max: 0.008000 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001598 | Grad Max: 0.182312 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.029967 | Grad Max: 1.021170 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.011916 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014977 | Grad Max: 0.074023 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000581 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003232 | Grad Max: 0.007975 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.002447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001180 | Grad Max: 0.002706 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021406 | Grad Max: 0.021406 [GRADIENT NORM TOTAL] 4.9766 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.539 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003218 0.49967813] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.041 [MASKS] A(Pass/Fail): 632/1416 | B: 473/1383 | C: 240/1808 [LOSS Ex1] A: 0.66496 | B: 0.66070 | C: 0.65884 [LOGITS Ex2 A] Mean Abs: 1.731 | Max: 5.907 [LOSS Ex2] A: 0.17575 | B: 0.36742 | C: 0.30575 ** [JOINT LOSS] ** : 0.944471 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004261 | Grad Max: 0.135296 -> Layer: shared_layers.0.bias | Grad Mean: 0.101943 | Grad Max: 0.404835 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.006967 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008970 | Grad Max: 0.008970 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000778 | Grad Max: 0.108192 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013339 | Grad Max: 0.553028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000129 | Grad Max: 0.003638 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005890 | Grad Max: 0.026240 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000291 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001368 | Grad Max: 0.003755 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000008 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000395 | Grad Max: 0.001343 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000607 | Grad Max: 0.001780 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009547 | Grad Max: 0.009547 [GRADIENT NORM TOTAL] 2.0575 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.339 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6164469 0.38355315] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 613/1435 | B: 495/1553 | C: 257/1791 [LOSS Ex1] A: 0.66236 | B: 0.66000 | C: 0.65755 [LOGITS Ex2 A] Mean Abs: 1.773 | Max: 5.776 [LOSS Ex2] A: 0.19534 | B: 0.41343 | C: 0.31515 ** [JOINT LOSS] ** : 0.967943 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005409 | Grad Max: 0.153562 -> Layer: shared_layers.0.bias | Grad Mean: 0.454520 | Grad Max: 1.992226 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.006873 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000774 | Grad Max: 0.000774 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002926 | Grad Max: 0.275385 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055001 | Grad Max: 1.553204 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.017570 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026639 | Grad Max: 0.124385 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000844 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005765 | Grad Max: 0.012078 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000389 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001648 | Grad Max: 0.004146 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002090 | Grad Max: 0.003653 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039359 | Grad Max: 0.039359 [GRADIENT NORM TOTAL] 9.0802 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.415 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5737555 0.4262445] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 506/1110 | B: 511/1537 | C: 291/1757 [LOSS Ex1] A: 0.66115 | B: 0.66062 | C: 0.65438 [LOGITS Ex2 A] Mean Abs: 1.820 | Max: 5.983 [LOSS Ex2] A: 0.18612 | B: 0.41808 | C: 0.33512 ** [JOINT LOSS] ** : 0.971826 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007920 | Grad Max: 0.211429 -> Layer: shared_layers.0.bias | Grad Mean: 0.638975 | Grad Max: 2.812143 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.007114 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001332 | Grad Max: 0.001332 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.400267 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077609 | Grad Max: 2.170432 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000766 | Grad Max: 0.024477 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036912 | Grad Max: 0.172528 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000099 | Grad Max: 0.001190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007983 | Grad Max: 0.017391 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000494 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002283 | Grad Max: 0.005344 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002967 | Grad Max: 0.005292 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053963 | Grad Max: 0.053963 [GRADIENT NORM TOTAL] 12.5498 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.541 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50609314 0.49390683] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 639/1409 | B: 512/1536 | C: 274/1774 [LOSS Ex1] A: 0.66136 | B: 0.65731 | C: 0.65480 [LOGITS Ex2 A] Mean Abs: 1.794 | Max: 6.383 [LOSS Ex2] A: 0.17984 | B: 0.38492 | 
C: 0.30531 ** [JOINT LOSS] ** : 0.947846 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003470 | Grad Max: 0.114626 -> Layer: shared_layers.0.bias | Grad Mean: 0.336455 | Grad Max: 1.587149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.006833 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000760 | Grad Max: 0.000760 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.268503 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040115 | Grad Max: 1.489549 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.014336 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018395 | Grad Max: 0.102598 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000562 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003971 | Grad Max: 0.008265 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001145 | Grad Max: 0.002570 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001496 | Grad Max: 0.003286 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027594 | Grad Max: 0.027594 [GRADIENT NORM TOTAL] 6.8119 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.490 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50853235 0.49146762] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 615/1433 | B: 473/1383 | C: 295/1753 [LOSS Ex1] A: 0.65964 | B: 0.66054 | C: 0.65496 [LOGITS Ex2 A] Mean Abs: 1.738 | Max: 5.971 [LOSS Ex2] A: 0.17516 | B: 0.37358 | C: 0.29120 ** [JOINT LOSS] ** : 0.938360 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002468 | Grad Max: 0.074818 -> Layer: shared_layers.0.bias | Grad Mean: 0.136415 | Grad Max: 0.511650 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.007674 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006742 | Grad Max: 0.006742 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.000965 | Grad Max: 0.185011 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017244 | Grad Max: 1.032131 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.006486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007119 | Grad Max: 0.040608 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000342 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001533 | Grad Max: 0.003912 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000437 | Grad Max: 0.001415 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000558 | Grad Max: 0.001749 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009818 | Grad Max: 0.009818 [GRADIENT NORM TOTAL] 3.1511 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.505 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014651 0.49853495] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 625/1423 | B: 496/1552 | C: 264/1784 [LOSS Ex1] A: 0.65866 | B: 0.65985 | C: 0.65654 [LOGITS Ex2 A] Mean Abs: 1.717 | Max: 6.797 [LOSS Ex2] A: 0.19030 | B: 0.41861 | C: 0.31825 ** [JOINT LOSS] ** : 0.967401 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003448 | Grad Max: 0.152888 -> Layer: shared_layers.0.bias | Grad Mean: 0.367461 | Grad Max: 1.876357 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.008010 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005072 | Grad Max: 0.005072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.321593 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044251 | Grad Max: 1.799676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000423 | Grad Max: 0.014086 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020538 | Grad Max: 0.097044 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000641 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.004420 | Grad Max: 0.009901 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001283 | Grad Max: 0.003017 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001708 | Grad Max: 0.003246 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031574 | Grad Max: 0.031574 [GRADIENT NORM TOTAL] 7.7611 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.423 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040328 0.4959672] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 593/1455 | B: 511/1537 | C: 279/1769 [LOSS Ex1] A: 0.66434 | B: 0.66048 | C: 0.65623 [LOGITS Ex2 A] Mean Abs: 1.702 | Max: 6.472 [LOSS Ex2] A: 0.17792 | B: 0.39534 | C: 0.29356 ** [JOINT LOSS] ** : 0.949288 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002514 | Grad Max: 0.071267 -> Layer: shared_layers.0.bias | Grad Mean: 0.191097 | Grad Max: 0.903039 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.006075 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006312 | Grad Max: 0.006312 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001311 | Grad Max: 0.181819 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024141 | Grad Max: 0.986535 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.009265 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011761 | Grad Max: 0.065312 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000398 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002475 | Grad Max: 0.005443 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000182 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000705 | Grad Max: 0.001653 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000910 | Grad Max: 0.002126 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016882 | Grad Max: 0.016882 [GRADIENT NORM TOTAL] 
4.1071 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.078 | Max: 0.350 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5308369 0.4691631] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.040 [MASKS] A(Pass/Fail): 589/1459 | B: 513/1535 | C: 279/1769 [LOSS Ex1] A: 0.66407 | B: 0.65716 | C: 0.65480 [LOGITS Ex2 A] Mean Abs: 1.713 | Max: 6.328 [LOSS Ex2] A: 0.18615 | B: 0.37939 | C: 0.29864 ** [JOINT LOSS] ** : 0.946737 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005728 | Grad Max: 0.174757 -> Layer: shared_layers.0.bias | Grad Mean: 0.266629 | Grad Max: 1.191052 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006538 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006526 | Grad Max: 0.006526 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001797 | Grad Max: 0.205774 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032988 | Grad Max: 1.040036 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.010173 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015516 | Grad Max: 0.067375 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000545 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003441 | Grad Max: 0.007668 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000242 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000994 | Grad Max: 0.002298 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001261 | Grad Max: 0.003077 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024109 | Grad Max: 0.024109 [GRADIENT NORM TOTAL] 5.2357 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.484 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64239913 0.35760087] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.042 [MASKS] A(Pass/Fail): 661/1387 | B: 473/1383 | C: 279/1769 [LOSS Ex1] A: 0.66018 
| B: 0.66040 | C: 0.65452 [LOGITS Ex2 A] Mean Abs: 1.753 | Max: 6.385 [LOSS Ex2] A: 0.17720 | B: 0.37250 | C: 0.31691 ** [JOINT LOSS] ** : 0.947238 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005437 | Grad Max: 0.145249 -> Layer: shared_layers.0.bias | Grad Mean: 0.303123 | Grad Max: 1.389012 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.007765 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010381 | Grad Max: 0.010381 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.216676 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038423 | Grad Max: 1.162578 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000384 | Grad Max: 0.011669 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018452 | Grad Max: 0.087296 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000642 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004082 | Grad Max: 0.008853 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000276 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001195 | Grad Max: 0.002807 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001646 | Grad Max: 0.003687 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030009 | Grad Max: 0.030009 [GRADIENT NORM TOTAL] 6.0780 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.543 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004664 0.4995336] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 633/1415 | B: 497/1551 | C: 290/1758 [LOSS Ex1] A: 0.66472 | B: 0.65971 | C: 0.65225 [LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.042 [LOSS Ex2] A: 0.17282 | B: 0.40741 | C: 0.28893 ** [JOINT LOSS] ** : 0.948610 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.071667 -> Layer: shared_layers.0.bias | Grad Mean: 0.046596 | Grad Max: 0.221297 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 
0.006677 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003201 | Grad Max: 0.003201 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000459 | Grad Max: 0.062611 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007468 | Grad Max: 0.346441 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003803 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002185 | Grad Max: 0.023501 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000342 | Grad Max: 0.001832 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000521 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001045 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002013 | Grad Max: 0.002013 [GRADIENT NORM TOTAL] 1.1469 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.343 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61766315 0.38233688] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 615/1433 | B: 514/1534 | C: 272/1776 [LOSS Ex1] A: 0.66210 | B: 0.66034 | C: 0.65599 [LOGITS Ex2 A] Mean Abs: 1.696 | Max: 5.673 [LOSS Ex2] A: 0.19174 | B: 0.40773 | C: 0.29849 ** [JOINT LOSS] ** : 0.958800 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005719 | Grad Max: 0.136227 -> Layer: shared_layers.0.bias | Grad Mean: 0.392534 | Grad Max: 1.762053 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.007040 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003156 | Grad Max: 0.003156 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002486 | Grad Max: 0.246528 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046750 | Grad Max: 1.351282 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.013054 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022481 | Grad Max: 0.092767 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000703 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004897 | Grad Max: 0.010607 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001416 | Grad Max: 0.003455 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001886 | Grad Max: 0.003480 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034566 | Grad Max: 0.034566 [GRADIENT NORM TOTAL] 7.4744 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.419 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57454455 0.4254554 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 507/1109 | B: 513/1535 | C: 267/1781 [LOSS Ex1] A: 0.66088 | B: 0.65701 | C: 0.65590 [LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.000 [LOSS Ex2] A: 0.17488 | B: 0.38857 | C: 0.30512 ** [JOINT LOSS] ** : 0.947453 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005670 | Grad Max: 0.146740 -> Layer: shared_layers.0.bias | Grad Mean: 0.434910 | Grad Max: 1.991657 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.006933 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002844 | Grad Max: 0.002844 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002783 | Grad Max: 0.343451 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052521 | Grad Max: 1.929182 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000525 | Grad Max: 0.016084 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025470 | Grad Max: 0.118153 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000740 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005562 | Grad Max: 0.011175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000364 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001611 | Grad Max: 0.003719 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002229 | Grad Max: 0.003962 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.039698 | Grad Max: 0.039698 [GRADIENT NORM TOTAL] 8.6660 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.545 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50607413 0.4939259 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 639/1409 | B: 475/1381 | C: 166/1210 [LOSS Ex1] A: 0.66109 | B: 0.66025 | C: 0.65683 [LOGITS Ex2 A] Mean Abs: 1.741 | Max: 6.029 [LOSS Ex2] A: 0.17784 | B: 0.37340 | C: 0.30609 ** [JOINT LOSS] ** : 0.945169 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003330 | Grad Max: 0.067401 -> Layer: shared_layers.0.bias | Grad Mean: 0.179443 | Grad Max: 0.837595 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.006479 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000997 | Grad Max: 0.000997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001167 | Grad Max: 0.099099 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021414 | Grad Max: 0.556341 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.008204 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011051 | Grad Max: 0.054507 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000323 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002373 | Grad Max: 0.005391 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000167 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000686 | Grad Max: 0.001668 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000988 | Grad Max: 0.002230 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017387 | Grad Max: 0.017387 [GRADIENT NORM TOTAL] 3.4015 [EPOCH SUMMARY] Train Loss: 0.9516 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9420 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 75/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.493 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508659 0.49134097] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 617/1431 | B: 498/1550 | C: 267/1781 [LOSS Ex1] A: 0.65934 | B: 0.65955 | C: 0.65470 [LOGITS Ex2 A] Mean Abs: 1.805 | Max: 5.753 [LOSS Ex2] A: 0.19245 | B: 0.40734 | C: 0.30279 ** [JOINT LOSS] ** : 0.958725 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008820 | Grad Max: 0.271262 -> Layer: shared_layers.0.bias | Grad Mean: 0.477312 | Grad Max: 1.967880 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.008113 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006325 | Grad Max: 0.006325 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.325503 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059155 | Grad Max: 1.693394 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000574 | Grad Max: 0.015654 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027553 | Grad Max: 0.107040 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000827 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006126 | Grad Max: 0.012922 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000406 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001774 | Grad Max: 0.004224 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002341 | Grad Max: 0.004136 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042818 | Grad Max: 0.042818 [GRADIENT NORM TOTAL] 9.2480 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.509 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014511 0.49854892] | Indices: 
[1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 625/1423 | B: 515/1533 | C: 268/1780 [LOSS Ex1] A: 0.65837 | B: 0.66019 | C: 0.65597 [LOGITS Ex2 A] Mean Abs: 1.800 | Max: 6.933 [LOSS Ex2] A: 0.21913 | B: 0.41410 | C: 0.32946 ** [JOINT LOSS] ** : 0.979073 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012172 | Grad Max: 0.378636 -> Layer: shared_layers.0.bias | Grad Mean: 0.774389 | Grad Max: 3.326923 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.007287 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000460 | Grad Max: 0.000460 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005184 | Grad Max: 0.480590 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095890 | Grad Max: 2.603522 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000935 | Grad Max: 0.024772 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045187 | Grad Max: 0.188484 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000122 | Grad Max: 0.001413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009996 | Grad Max: 0.020998 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000669 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002905 | Grad Max: 0.007066 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003913 | Grad Max: 0.006739 -> Layer: exit2_layers.12.bias | Grad Mean: 0.070738 | Grad Max: 0.070738 [GRADIENT NORM TOTAL] 15.2219 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50401217 0.4959878 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 595/1453 | B: 514/1534 | C: 280/1768 [LOSS Ex1] A: 0.66410 | B: 0.65685 | C: 0.65526 [LOGITS Ex2 A] Mean Abs: 1.752 | Max: 5.973 [LOSS Ex2] A: 0.18649 | B: 0.40208 | C: 0.31173 ** [JOINT LOSS] ** : 0.958838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010126 | Grad Max: 
0.280760 -> Layer: shared_layers.0.bias | Grad Mean: 0.611941 | Grad Max: 2.700320 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.006025 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003677 | Grad Max: 0.003677 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004008 | Grad Max: 0.420772 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074347 | Grad Max: 2.284436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000730 | Grad Max: 0.022831 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035232 | Grad Max: 0.169613 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.001083 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007784 | Grad Max: 0.016435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000497 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002258 | Grad Max: 0.005344 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002911 | Grad Max: 0.005272 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053901 | Grad Max: 0.053901 [GRADIENT NORM TOTAL] 11.7868 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.354 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53107405 0.46892592] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 591/1457 | B: 477/1379 | C: 276/1772 [LOSS Ex1] A: 0.66383 | B: 0.66012 | C: 0.65451 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 6.168 [LOSS Ex2] A: 0.18210 | B: 0.36551 | C: 0.31009 ** [JOINT LOSS] ** : 0.945387 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004310 | Grad Max: 0.137336 -> Layer: shared_layers.0.bias | Grad Mean: 0.140868 | Grad Max: 0.531879 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006652 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000845 | Grad Max: 0.000845 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001047 | Grad Max: 0.114490 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018817 | Grad Max: 0.638174 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.005913 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008803 | Grad Max: 0.041494 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000361 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001986 | Grad Max: 0.004848 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000147 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000591 | Grad Max: 0.001431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000827 | Grad Max: 0.002231 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015350 | Grad Max: 0.015350 [GRADIENT NORM TOTAL] 2.8297 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.488 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6437795 0.35622048] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 662/1386 | B: 501/1547 | C: 288/1760 [LOSS Ex1] A: 0.65991 | B: 0.65942 | C: 0.65390 [LOGITS Ex2 A] Mean Abs: 1.697 | Max: 5.945 [LOSS Ex2] A: 0.17845 | B: 0.42957 | C: 0.31953 ** [JOINT LOSS] ** : 0.966928 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009328 | Grad Max: 0.212218 -> Layer: shared_layers.0.bias | Grad Mean: 0.651825 | Grad Max: 2.795566 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.007801 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011470 | Grad Max: 0.011470 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004169 | Grad Max: 0.465280 -> Layer: exit2_layers.0.bias | Grad Mean: 0.078541 | Grad Max: 2.603276 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000774 | Grad Max: 0.022486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037439 | Grad Max: 0.163317 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001072 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008154 | Grad Max: 0.016661 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000529 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002365 | Grad Max: 0.005594 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003125 | Grad Max: 0.005947 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057299 | Grad Max: 0.057299 [GRADIENT NORM TOTAL] 12.7624 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.548 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50044197 0.49955803] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 634/1414 | B: 516/1532 | C: 258/1790 [LOSS Ex1] A: 0.66448 | B: 0.66007 | C: 0.65637 [LOGITS Ex2 A] Mean Abs: 1.656 | Max: 6.024 [LOSS Ex2] A: 0.19776 | B: 0.45858 | C: 0.34740 ** [JOINT LOSS] ** : 0.994887 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.016048 | Grad Max: 0.419653 -> Layer: shared_layers.0.bias | Grad Mean: 0.975248 | Grad Max: 4.135005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.006548 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005007 | Grad Max: 0.005007 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006269 | Grad Max: 0.655250 -> Layer: exit2_layers.0.bias | Grad Mean: 0.117815 | Grad Max: 3.560012 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001182 | Grad Max: 0.032413 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057244 | Grad Max: 0.248688 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000155 | Grad Max: 0.001610 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012596 | Grad Max: 0.025734 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000067 | Grad Max: 0.000737 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003670 | Grad Max: 0.008140 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004768 | Grad Max: 0.009722 -> Layer: exit2_layers.12.bias | Grad Mean: 0.089243 | Grad Max: 0.089243 [GRADIENT NORM TOTAL] 18.5665 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.082 | Max: 0.346 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61865026 0.38134974] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 618/1430 | B: 515/1533 | C: 264/1784 [LOSS Ex1] A: 0.66185 | B: 0.65673 | C: 0.65584 [LOGITS Ex2 A] Mean Abs: 1.632 | Max: 6.375 [LOSS Ex2] A: 0.21864 | B: 0.44103 | C: 0.33473 ** [JOINT LOSS] ** : 0.989605 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014827 | Grad Max: 0.386849 -> Layer: shared_layers.0.bias | Grad Mean: 0.892660 | Grad Max: 3.725726 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007030 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003850 | Grad Max: 0.003850 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005756 | Grad Max: 0.612693 -> Layer: exit2_layers.0.bias | Grad Mean: 0.108732 | Grad Max: 3.292723 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001098 | Grad Max: 0.030558 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053023 | Grad Max: 0.222325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000144 | Grad Max: 0.001551 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011610 | Grad Max: 0.022983 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000062 | Grad Max: 0.000698 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003370 | Grad Max: 0.007619 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004493 | Grad Max: 0.009296 -> Layer: exit2_layers.12.bias | Grad Mean: 0.081940 | Grad Max: 0.081940 [GRADIENT NORM TOTAL] 16.9552 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.422 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5751543 0.42484573] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 507/1109 | B: 480/1376 | C: 261/1787 [LOSS Ex1] A: 0.66064 | B: 0.66001 | C: 0.65597 [LOGITS Ex2 A] Mean Abs: 1.725 | Max: 6.173 [LOSS Ex2] A: 0.19389 | B: 0.39934 | C: 0.31714 ** [JOINT LOSS] ** : 
0.962329 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010300 | Grad Max: 0.278950 -> Layer: shared_layers.0.bias | Grad Mean: 0.598087 | Grad Max: 2.506990 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.007043 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001452 | Grad Max: 0.001452 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003765 | Grad Max: 0.384322 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070627 | Grad Max: 2.077984 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000716 | Grad Max: 0.021713 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034564 | Grad Max: 0.160348 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001016 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007614 | Grad Max: 0.015445 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000470 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002213 | Grad Max: 0.005139 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003030 | Grad Max: 0.005449 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054420 | Grad Max: 0.054420 [GRADIENT NORM TOTAL] 11.1195 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.549 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5060636 0.49393642] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 640/1408 | B: 502/1546 | C: 270/1778 [LOSS Ex1] A: 0.66086 | B: 0.65931 | C: 0.65571 [LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.536 [LOSS Ex2] A: 0.17463 | B: 0.40185 | C: 0.30565 ** [JOINT LOSS] ** : 0.952668 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002675 | Grad Max: 0.094939 -> Layer: shared_layers.0.bias | Grad Mean: 0.133546 | Grad Max: 0.708245 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.006526 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003555 | Grad Max: 0.003555 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000934 | 
Grad Max: 0.110972 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016206 | Grad Max: 0.626096 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.005809 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006280 | Grad Max: 0.039229 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001335 | Grad Max: 0.003828 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000388 | Grad Max: 0.001138 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001570 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009081 | Grad Max: 0.009081 [GRADIENT NORM TOTAL] 2.7678 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.497 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086708 0.4913292] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 618/1430 | B: 517/1531 | C: 251/1797 [LOSS Ex1] A: 0.65911 | B: 0.65996 | C: 0.65707 [LOGITS Ex2 A] Mean Abs: 1.773 | Max: 5.868 [LOSS Ex2] A: 0.19169 | B: 0.41335 | C: 0.30100 ** [JOINT LOSS] ** : 0.960726 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005918 | Grad Max: 0.203445 -> Layer: shared_layers.0.bias | Grad Mean: 0.556879 | Grad Max: 2.621548 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.007242 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003972 | Grad Max: 0.003972 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003513 | Grad Max: 0.343868 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066460 | Grad Max: 1.932683 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000646 | Grad Max: 0.021781 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031757 | Grad Max: 0.154622 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000982 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006896 | Grad Max: 
0.015295 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000413 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002004 | Grad Max: 0.004744 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002550 | Grad Max: 0.004479 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048200 | Grad Max: 0.048200 [GRADIENT NORM TOTAL] 11.0775 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.512 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5014629 0.49853712] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 625/1423 | B: 517/1531 | C: 285/1763 [LOSS Ex1] A: 0.65814 | B: 0.65662 | C: 0.65299 [LOGITS Ex2 A] Mean Abs: 1.773 | Max: 6.435 [LOSS Ex2] A: 0.19547 | B: 0.40192 | C: 0.32126 ** [JOINT LOSS] ** : 0.962130 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007180 | Grad Max: 0.197197 -> Layer: shared_layers.0.bias | Grad Mean: 0.587957 | Grad Max: 2.528712 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002351 | Grad Max: 0.008007 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006085 | Grad Max: 0.006085 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003853 | Grad Max: 0.374077 -> Layer: exit2_layers.0.bias | Grad Mean: 0.072151 | Grad Max: 2.090177 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000726 | Grad Max: 0.022301 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035320 | Grad Max: 0.157819 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001122 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007703 | Grad Max: 0.017093 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002245 | Grad Max: 0.005441 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002882 | Grad Max: 0.005064 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053870 | Grad Max: 0.053870 [GRADIENT NORM TOTAL] 11.7154 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.430 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040271 0.49597284] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.534 | Std: 0.042 [MASKS] A(Pass/Fail): 595/1453 | B: 482/1374 | C: 269/1779 [LOSS Ex1] A: 0.66390 | B: 0.65991 | C: 0.65530 [LOGITS Ex2 A] Mean Abs: 1.718 | Max: 6.244 [LOSS Ex2] A: 0.17651 | B: 0.36980 | C: 0.29667 ** [JOINT LOSS] ** : 0.940695 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003683 | Grad Max: 0.094974 -> Layer: shared_layers.0.bias | Grad Mean: 0.249495 | Grad Max: 1.116526 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.006212 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000846 | Grad Max: 0.000846 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001664 | Grad Max: 0.171817 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030739 | Grad Max: 0.952684 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.010627 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014734 | Grad Max: 0.071975 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000455 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003276 | Grad Max: 0.007032 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000981 | Grad Max: 0.002269 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001274 | Grad Max: 0.003225 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025040 | Grad Max: 0.025040 [GRADIENT NORM TOTAL] 5.0446 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.356 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5312386 0.46876144] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 594/1454 | B: 502/1546 | C: 283/1765 [LOSS Ex1] A: 0.66364 | B: 0.65921 | C: 0.65466 [LOGITS Ex2 A] Mean Abs: 
1.651 | Max: 6.387 [LOSS Ex2] A: 0.18612 | B: 0.41765 | C: 0.29993 ** [JOINT LOSS] ** : 0.960405 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005341 | Grad Max: 0.137628 -> Layer: shared_layers.0.bias | Grad Mean: 0.462723 | Grad Max: 1.927409 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.007046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008204 | Grad Max: 0.008204 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002895 | Grad Max: 0.317643 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054530 | Grad Max: 1.777605 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.016008 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026951 | Grad Max: 0.117312 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000756 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005900 | Grad Max: 0.012067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000388 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001717 | Grad Max: 0.004236 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002167 | Grad Max: 0.003803 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040985 | Grad Max: 0.040985 [GRADIENT NORM TOTAL] 9.0132 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.491 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64482635 0.35517365] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 663/1385 | B: 517/1531 | C: 208/1168 [LOSS Ex1] A: 0.65971 | B: 0.65987 | C: 0.65185 [LOGITS Ex2 A] Mean Abs: 1.680 | Max: 6.122 [LOSS Ex2] A: 0.17170 | B: 0.43048 | C: 0.34276 ** [JOINT LOSS] ** : 0.972120 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007217 | Grad Max: 0.205040 -> Layer: shared_layers.0.bias | Grad Mean: 0.642563 | Grad Max: 2.751889 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006963 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.003828 | Grad Max: 0.003828 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004039 | Grad Max: 0.413030 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076199 | Grad Max: 2.318005 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000765 | Grad Max: 0.022387 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037433 | Grad Max: 0.159917 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.001053 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008196 | Grad Max: 0.016948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000516 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002401 | Grad Max: 0.005676 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003006 | Grad Max: 0.006109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058096 | Grad Max: 0.058096 [GRADIENT NORM TOTAL] 12.5835 [EPOCH SUMMARY] Train Loss: 0.9646 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9430 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 76/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.551 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50042564 0.49957436] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 635/1413 | B: 518/1530 | C: 263/1785 [LOSS Ex1] A: 0.66430 | B: 0.65652 | C: 0.65532 [LOGITS Ex2 A] Mean Abs: 1.688 | Max: 5.436 [LOSS Ex2] A: 0.17369 | B: 0.40355 | C: 0.32805 ** [JOINT LOSS] ** : 0.960476 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007215 | Grad Max: 0.175573 -> Layer: shared_layers.0.bias | Grad Mean: 0.520981 | Grad Max: 2.228011 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.006125 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002123 | Grad Max: 0.002123 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003303 | Grad Max: 0.323602 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.061692 | Grad Max: 1.787128 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000610 | Grad Max: 0.018344 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029792 | Grad Max: 0.132651 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000815 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006583 | Grad Max: 0.013256 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000413 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001928 | Grad Max: 0.004642 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002467 | Grad Max: 0.004649 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046444 | Grad Max: 0.046444 [GRADIENT NORM TOTAL] 10.0757 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.349 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6193959 0.38060412] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 618/1430 | B: 482/1374 | C: 295/1753 [LOSS Ex1] A: 0.66166 | B: 0.65982 | C: 0.65131 [LOGITS Ex2 A] Mean Abs: 1.710 | Max: 5.833 [LOSS Ex2] A: 0.19456 | B: 0.36767 | C: 0.28880 ** [JOINT LOSS] ** : 0.941270 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.086297 -> Layer: shared_layers.0.bias | Grad Mean: 0.116746 | Grad Max: 0.614690 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006947 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000049 | Grad Max: 0.000049 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000781 | Grad Max: 0.126645 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013711 | Grad Max: 0.722070 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.006642 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005914 | Grad Max: 0.040219 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000288 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001221 | Grad Max: 0.003586 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000114 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000339 | Grad Max: 0.001064 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001590 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007128 | Grad Max: 0.007128 [GRADIENT NORM TOTAL] 2.3734 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.425 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5755389 0.4244611] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 507/1109 | B: 502/1546 | C: 245/1803 [LOSS Ex1] A: 0.66044 | B: 0.65911 | C: 0.65859 [LOGITS Ex2 A] Mean Abs: 1.785 | Max: 6.157 [LOSS Ex2] A: 0.19228 | B: 0.41923 | C: 0.32623 ** [JOINT LOSS] ** : 0.971961 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009604 | Grad Max: 0.227020 -> Layer: shared_layers.0.bias | Grad Mean: 0.613315 | Grad Max: 2.654439 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.006749 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001154 | Grad Max: 0.001154 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004005 | Grad Max: 0.353312 -> Layer: exit2_layers.0.bias | Grad Mean: 0.075460 | Grad Max: 1.928097 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000749 | Grad Max: 0.021833 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036463 | Grad Max: 0.163876 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001070 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008012 | Grad Max: 0.016831 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000475 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002331 | Grad Max: 0.005367 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002869 | Grad Max: 0.004928 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054567 | Grad Max: 0.054567 [GRADIENT NORM TOTAL] 11.9006 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.552 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061264 0.49387363] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 640/1408 | B: 518/1530 | C: 260/1788 [LOSS Ex1] A: 0.66066 | B: 0.65977 | C: 0.65648 [LOGITS Ex2 A] Mean Abs: 1.801 | Max: 6.926 [LOSS Ex2] A: 0.20986 | B: 0.43662 | C: 0.35113 ** [JOINT LOSS] ** : 0.991508 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014648 | Grad Max: 0.358917 -> Layer: shared_layers.0.bias | Grad Mean: 0.929795 | Grad Max: 3.897186 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.007049 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004907 | Grad Max: 0.004907 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006068 | Grad Max: 0.574206 -> Layer: exit2_layers.0.bias | Grad Mean: 0.113869 | Grad Max: 3.114107 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001120 | Grad Max: 0.031238 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054522 | Grad Max: 0.236907 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000145 | Grad Max: 0.001609 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012016 | Grad Max: 0.024755 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000064 | Grad Max: 0.000765 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003536 | Grad Max: 0.008290 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004696 | Grad Max: 0.008469 -> Layer: exit2_layers.12.bias | Grad Mean: 0.085741 | Grad Max: 0.085741 [GRADIENT NORM TOTAL] 18.0800 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.500 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508697 0.49130306] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 619/1429 | B: 519/1529 | C: 289/1759 [LOSS Ex1] A: 0.65890 | B: 0.65642 | C: 0.65418 [LOGITS Ex2 A] Mean Abs: 1.764 | Max: 6.156 [LOSS 
Ex2] A: 0.19967 | B: 0.42156 | C: 0.31591 ** [JOINT LOSS] ** : 0.968877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011543 | Grad Max: 0.289520 -> Layer: shared_layers.0.bias | Grad Mean: 0.796664 | Grad Max: 3.239415 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.007668 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004845 | Grad Max: 0.004845 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005074 | Grad Max: 0.504180 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095532 | Grad Max: 2.736287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000965 | Grad Max: 0.030094 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047113 | Grad Max: 0.225837 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000125 | Grad Max: 0.001411 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010334 | Grad Max: 0.021575 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.000614 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003040 | Grad Max: 0.007187 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003948 | Grad Max: 0.006641 -> Layer: exit2_layers.12.bias | Grad Mean: 0.073448 | Grad Max: 0.073448 [GRADIENT NORM TOTAL] 15.4275 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.515 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015361 0.4984639] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 628/1420 | B: 483/1373 | C: 276/1772 [LOSS Ex1] A: 0.65792 | B: 0.65972 | C: 0.65327 [LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.952 [LOSS Ex2] A: 0.20036 | B: 0.38018 | C: 0.29294 ** [JOINT LOSS] ** : 0.948130 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005506 | Grad Max: 0.138064 -> Layer: shared_layers.0.bias | Grad Mean: 0.347541 | Grad Max: 1.291861 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002250 | Grad Max: 0.008143 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006498 | Grad Max: 
0.006498 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.239634 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041751 | Grad Max: 1.306722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.012831 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020497 | Grad Max: 0.097711 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000645 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004561 | Grad Max: 0.009266 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000291 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001356 | Grad Max: 0.003154 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001836 | Grad Max: 0.003827 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034123 | Grad Max: 0.034123 [GRADIENT NORM TOTAL] 6.7513 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.432 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040813 0.49591866] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 600/1448 | B: 503/1545 | C: 263/1785 [LOSS Ex1] A: 0.66371 | B: 0.65902 | C: 0.65482 [LOGITS Ex2 A] Mean Abs: 1.661 | Max: 6.650 [LOSS Ex2] A: 0.18614 | B: 0.41432 | C: 0.28759 ** [JOINT LOSS] ** : 0.955203 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006753 | Grad Max: 0.185751 -> Layer: shared_layers.0.bias | Grad Mean: 0.375633 | Grad Max: 1.607423 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.005886 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002350 | Grad Max: 0.002350 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002568 | Grad Max: 0.297751 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047903 | Grad Max: 1.683822 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000463 | Grad Max: 0.012822 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022436 | Grad Max: 0.095253 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000706 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004995 | Grad Max: 0.010950 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001468 | Grad Max: 0.003554 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001967 | Grad Max: 0.003550 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035558 | Grad Max: 0.035558 [GRADIENT NORM TOTAL] 7.6412 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.079 | Max: 0.359 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53137517 0.4686249 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 595/1453 | B: 519/1529 | C: 278/1770 [LOSS Ex1] A: 0.66346 | B: 0.65969 | C: 0.65344 [LOGITS Ex2 A] Mean Abs: 1.596 | Max: 6.907 [LOSS Ex2] A: 0.20244 | B: 0.43500 | C: 0.32132 ** [JOINT LOSS] ** : 0.978450 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011227 | Grad Max: 0.286037 -> Layer: shared_layers.0.bias | Grad Mean: 0.695478 | Grad Max: 2.929729 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006518 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002779 | Grad Max: 0.002779 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004551 | Grad Max: 0.550032 -> Layer: exit2_layers.0.bias | Grad Mean: 0.085066 | Grad Max: 2.994334 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000831 | Grad Max: 0.023480 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040473 | Grad Max: 0.177985 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000108 | Grad Max: 0.001145 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009010 | Grad Max: 0.018508 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000530 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002665 | Grad Max: 0.005935 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003428 | Grad Max: 0.006562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064918 | Grad Max: 0.064918 [GRADIENT NORM 
TOTAL] 13.6335 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.493 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6457763 0.35422373] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 665/1383 | B: 520/1528 | C: 286/1762 [LOSS Ex1] A: 0.65951 | B: 0.65633 | C: 0.65183 [LOGITS Ex2 A] Mean Abs: 1.657 | Max: 5.729 [LOSS Ex2] A: 0.19275 | B: 0.40719 | C: 0.30189 ** [JOINT LOSS] ** : 0.956497 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010744 | Grad Max: 0.254590 -> Layer: shared_layers.0.bias | Grad Mean: 0.611231 | Grad Max: 2.507270 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002327 | Grad Max: 0.007695 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007409 | Grad Max: 0.007409 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.454392 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076652 | Grad Max: 2.529959 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000751 | Grad Max: 0.021763 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036560 | Grad Max: 0.158557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.000999 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008149 | Grad Max: 0.016390 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000505 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002389 | Grad Max: 0.005931 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003021 | Grad Max: 0.005466 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056584 | Grad Max: 0.056584 [GRADIENT NORM TOTAL] 12.0100 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.554 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003772 0.49962285] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.042 [MASKS] A(Pass/Fail): 637/1411 | B: 484/1372 | C: 295/1753 [LOSS Ex1] 
A: 0.66413 | B: 0.65964 | C: 0.65299 [LOGITS Ex2 A] Mean Abs: 1.695 | Max: 5.589 [LOSS Ex2] A: 0.17422 | B: 0.37269 | C: 0.29734 ** [JOINT LOSS] ** : 0.940336 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005714 | Grad Max: 0.199589 -> Layer: shared_layers.0.bias | Grad Mean: 0.251036 | Grad Max: 1.119328 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.006745 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005731 | Grad Max: 0.005731 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001761 | Grad Max: 0.300203 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031810 | Grad Max: 1.686210 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.010782 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014146 | Grad Max: 0.073461 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000482 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003214 | Grad Max: 0.007041 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000243 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000954 | Grad Max: 0.002535 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001252 | Grad Max: 0.002952 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022962 | Grad Max: 0.022962 [GRADIENT NORM TOTAL] 5.3986 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.351 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62007815 0.37992182] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 620/1428 | B: 503/1545 | C: 255/1793 [LOSS Ex1] A: 0.66148 | B: 0.65893 | C: 0.65766 [LOGITS Ex2 A] Mean Abs: 1.723 | Max: 5.419 [LOSS Ex2] A: 0.19513 | B: 0.39762 | C: 0.30536 ** [JOINT LOSS] ** : 0.958726 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003993 | Grad Max: 0.113352 -> Layer: shared_layers.0.bias | Grad Mean: 0.347431 | Grad Max: 1.402507 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | 
Grad Max: 0.006493 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001657 | Grad Max: 0.001657 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.229580 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041441 | Grad Max: 1.299487 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.013933 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020458 | Grad Max: 0.103260 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000640 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004481 | Grad Max: 0.009906 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000301 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001326 | Grad Max: 0.003113 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001642 | Grad Max: 0.003394 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032586 | Grad Max: 0.032586 [GRADIENT NORM TOTAL] 6.8495 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57596165 0.42403835] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 509/1107 | B: 519/1529 | C: 246/1802 [LOSS Ex1] A: 0.66025 | B: 0.65960 | C: 0.65826 [LOGITS Ex2 A] Mean Abs: 1.775 | Max: 5.709 [LOSS Ex2] A: 0.19314 | B: 0.41856 | C: 0.33451 ** [JOINT LOSS] ** : 0.974778 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006935 | Grad Max: 0.221296 -> Layer: shared_layers.0.bias | Grad Mean: 0.627665 | Grad Max: 2.879717 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.006633 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000852 | Grad Max: 0.000852 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003938 | Grad Max: 0.380767 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074655 | Grad Max: 2.126667 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000747 | Grad Max: 0.022521 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036793 | Grad Max: 
0.170642 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.001023 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008069 | Grad Max: 0.016022 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000508 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002391 | Grad Max: 0.005689 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003041 | Grad Max: 0.005037 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057661 | Grad Max: 0.057661 [GRADIENT NORM TOTAL] 12.3411 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.555 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061742 0.4938258] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 644/1404 | B: 521/1527 | C: 298/1750 [LOSS Ex1] A: 0.66049 | B: 0.65624 | C: 0.65395 [LOGITS Ex2 A] Mean Abs: 1.741 | Max: 5.844 [LOSS Ex2] A: 0.19411 | B: 0.39667 | C: 0.33113 ** [JOINT LOSS] ** : 0.964196 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007199 | Grad Max: 0.177909 -> Layer: shared_layers.0.bias | Grad Mean: 0.530358 | Grad Max: 2.375537 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.007133 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004040 | Grad Max: 0.004040 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003396 | Grad Max: 0.342870 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064011 | Grad Max: 1.864028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000632 | Grad Max: 0.020024 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030936 | Grad Max: 0.144832 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000886 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006812 | Grad Max: 0.013948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000404 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002019 | Grad Max: 0.004631 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002564 | Grad Max: 0.004694 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.048498 | Grad Max: 0.048498 [GRADIENT NORM TOTAL] 10.4090 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.503 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087496 0.4912504] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 620/1428 | B: 486/1370 | C: 190/1186 [LOSS Ex1] A: 0.65872 | B: 0.65955 | C: 0.65347 [LOGITS Ex2 A] Mean Abs: 1.707 | Max: 5.696 [LOSS Ex2] A: 0.17446 | B: 0.36599 | C: 0.29955 ** [JOINT LOSS] ** : 0.937249 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002663 | Grad Max: 0.065839 -> Layer: shared_layers.0.bias | Grad Mean: 0.051692 | Grad Max: 0.241881 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.007526 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003026 | Grad Max: 0.003026 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000497 | Grad Max: 0.092256 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008381 | Grad Max: 0.520814 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.004689 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002677 | Grad Max: 0.020695 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000175 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000615 | Grad Max: 0.002438 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000188 | Grad Max: 0.000892 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001581 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004982 | Grad Max: 0.004982 [GRADIENT NORM TOTAL] 1.5265 [EPOCH SUMMARY] Train Loss: 0.9605 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9404 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 77/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.518 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50158155 0.49841845] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 629/1419 | B: 506/1542 | C: 258/1790 [LOSS Ex1] A: 0.65773 | B: 0.65885 | C: 0.65454 [LOGITS Ex2 A] Mean Abs: 1.649 | Max: 6.326 [LOSS Ex2] A: 0.20595 | B: 0.43135 | C: 0.30266 ** [JOINT LOSS] ** : 0.970356 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006368 | Grad Max: 0.159190 -> Layer: shared_layers.0.bias | Grad Mean: 0.494131 | Grad Max: 2.092587 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.007461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002225 | Grad Max: 0.002225 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003099 | Grad Max: 0.366914 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058439 | Grad Max: 2.063640 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.018361 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027897 | Grad Max: 0.130908 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000797 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006114 | Grad Max: 0.012796 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000376 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001810 | Grad Max: 0.004156 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002302 | Grad Max: 0.004390 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043933 | Grad Max: 0.043933 [GRADIENT NORM TOTAL] 9.5870 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.434 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040698 0.4959302] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 601/1447 | B: 519/1529 | C: 269/1779 [LOSS Ex1] A: 0.66355 | B: 0.65951 | C: 0.65506 [LOGITS Ex2 A] Mean Abs: 1.597 | Max: 5.624 [LOSS Ex2] A: 0.18634 | B: 0.43646 | C: 0.31715 ** [JOINT LOSS] ** : 0.972695 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009568 | Grad Max: 0.226326 -> Layer: shared_layers.0.bias | Grad Mean: 0.667073 | Grad Max: 2.783952 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.006054 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002685 | Grad Max: 0.002685 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004266 | Grad Max: 0.434856 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080497 | Grad Max: 2.372323 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000801 | Grad Max: 0.025499 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039358 | Grad Max: 0.192007 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001112 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008690 | Grad Max: 0.018427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000548 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002566 | Grad Max: 0.006073 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003245 | Grad Max: 0.006172 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061642 | Grad Max: 0.061642 [GRADIENT NORM TOTAL] 12.8492 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.361 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53153455 0.46846545] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.534 | Std: 0.041 [MASKS] A(Pass/Fail): 595/1453 | B: 521/1527 | C: 270/1778 [LOSS Ex1] A: 0.66330 | B: 0.65614 | C: 0.65448 [LOGITS Ex2 A] Mean Abs: 1.602 | Max: 5.552 [LOSS Ex2] A: 0.19815 | B: 0.40373 | C: 0.31166 ** [JOINT LOSS] ** : 0.962482 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008435 | Grad Max: 
0.202870 -> Layer: shared_layers.0.bias | Grad Mean: 0.502927 | Grad Max: 2.160730 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006475 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002889 | Grad Max: 0.002889 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.352473 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060652 | Grad Max: 1.941685 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.017428 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029892 | Grad Max: 0.129145 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000919 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006614 | Grad Max: 0.014140 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000435 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001957 | Grad Max: 0.004901 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002537 | Grad Max: 0.004560 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046971 | Grad Max: 0.046971 [GRADIENT NORM TOTAL] 9.5342 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.496 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6467272 0.35327277] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 666/1382 | B: 487/1369 | C: 299/1749 [LOSS Ex1] A: 0.65932 | B: 0.65945 | C: 0.65157 [LOGITS Ex2 A] Mean Abs: 1.686 | Max: 5.826 [LOSS Ex2] A: 0.17760 | B: 0.36299 | C: 0.31092 ** [JOINT LOSS] ** : 0.940614 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001662 | Grad Max: 0.043027 -> Layer: shared_layers.0.bias | Grad Mean: 0.052840 | Grad Max: 0.228319 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.007701 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007218 | Grad Max: 0.007218 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000456 | Grad Max: 0.072409 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007948 | Grad Max: 0.391340 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.003280 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002765 | Grad Max: 0.020500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000224 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000555 | Grad Max: 0.002718 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000156 | Grad Max: 0.000636 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000467 | Grad Max: 0.001486 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003531 | Grad Max: 0.003531 [GRADIENT NORM TOTAL] 1.3861 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.557 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004179 0.49958214] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 638/1410 | B: 506/1542 | C: 278/1770 [LOSS Ex1] A: 0.66396 | B: 0.65874 | C: 0.65284 [LOGITS Ex2 A] Mean Abs: 1.744 | Max: 5.326 [LOSS Ex2] A: 0.18184 | B: 0.41925 | C: 0.33741 ** [JOINT LOSS] ** : 0.971345 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007682 | Grad Max: 0.197931 -> Layer: shared_layers.0.bias | Grad Mean: 0.626019 | Grad Max: 2.501388 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.006042 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000189 | Grad Max: 0.000189 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003940 | Grad Max: 0.369041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074705 | Grad Max: 2.074791 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000756 | Grad Max: 0.024334 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037276 | Grad Max: 0.184800 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001105 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008164 | Grad Max: 0.017521 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000509 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002402 | Grad Max: 0.005532 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002950 | Grad Max: 0.005171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056426 | Grad Max: 0.056426 [GRADIENT NORM TOTAL] 12.0735 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.354 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6208825 0.37911752] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 620/1428 | B: 520/1528 | C: 278/1770 [LOSS Ex1] A: 0.66128 | B: 0.65939 | C: 0.65583 [LOGITS Ex2 A] Mean Abs: 1.757 | Max: 5.248 [LOSS Ex2] A: 0.21473 | B: 0.44576 | C: 0.33553 ** [JOINT LOSS] ** : 0.990844 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010248 | Grad Max: 0.253007 -> Layer: shared_layers.0.bias | Grad Mean: 0.840399 | Grad Max: 3.459207 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.006545 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002185 | Grad Max: 0.002185 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005239 | Grad Max: 0.530590 -> Layer: exit2_layers.0.bias | Grad Mean: 0.098656 | Grad Max: 3.000314 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000990 | Grad Max: 0.031657 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048980 | Grad Max: 0.238478 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000127 | Grad Max: 0.001460 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010787 | Grad Max: 0.022586 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.000669 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003206 | Grad Max: 0.007412 -> Layer: exit2_layers.12.weight | Grad Mean: 0.004189 | Grad Max: 0.007656 -> Layer: exit2_layers.12.bias | Grad Mean: 0.077485 | Grad Max: 0.077485 [GRADIENT NORM TOTAL] 16.3654 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.085 | Max: 0.430 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5764949 0.42350516] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 509/1107 | B: 522/1526 | C: 289/1759 [LOSS Ex1] A: 0.66005 | B: 0.65601 | C: 0.65127 [LOGITS Ex2 A] Mean Abs: 1.800 | Max: 6.356 [LOSS Ex2] A: 0.19830 | B: 0.41623 | C: 0.30846 ** [JOINT LOSS] ** : 0.963443 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007875 | Grad Max: 0.239320 -> Layer: shared_layers.0.bias | Grad Mean: 0.725619 | Grad Max: 3.105739 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.007230 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006899 | Grad Max: 0.006899 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004418 | Grad Max: 0.464533 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083639 | Grad Max: 2.583216 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000848 | Grad Max: 0.030269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042083 | Grad Max: 0.215587 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001218 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009246 | Grad Max: 0.018792 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000579 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002738 | Grad Max: 0.006525 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003415 | Grad Max: 0.005775 -> Layer: exit2_layers.12.bias | Grad Mean: 0.065069 | Grad Max: 0.065069 [GRADIENT NORM TOTAL] 13.9772 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.558 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5061977 0.4938023] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 644/1404 | B: 489/1367 | C: 266/1782 [LOSS Ex1] A: 0.66028 | B: 0.65933 | C: 0.65509 [LOGITS Ex2 A] Mean Abs: 1.745 | Max: 6.127 [LOSS Ex2] A: 0.18292 | B: 0.37559 | C: 0.30535 ** [JOINT LOSS] ** : 0.946187 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003820 | Grad Max: 0.123193 -> Layer: shared_layers.0.bias | Grad Mean: 0.340883 | Grad Max: 1.366789 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.007477 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010207 | Grad Max: 0.010207 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.213513 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038901 | Grad Max: 1.209005 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.014977 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020233 | Grad Max: 0.110579 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000617 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004379 | Grad Max: 0.008962 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000274 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001312 | Grad Max: 0.003087 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001654 | Grad Max: 0.003496 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031776 | Grad Max: 0.031776 [GRADIENT NORM TOTAL] 6.4645 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.507 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088226 0.4911774] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 621/1427 | B: 507/1541 | C: 279/1769 [LOSS Ex1] A: 0.65849 | B: 0.65862 | C: 0.65335 [LOGITS Ex2 A] Mean Abs: 1.682 | Max: 5.613 [LOSS Ex2] A: 0.18409 | B: 0.41492 | C: 0.31056 ** [JOINT LOSS] ** : 0.960010 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005482 | Grad Max: 0.148315 -> Layer: shared_layers.0.bias | Grad Mean: 0.412188 | Grad Max: 1.834075 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.008382 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011319 | Grad Max: 0.011319 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002662 | Grad Max: 
0.417309 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049662 | Grad Max: 2.345150 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.015661 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023155 | Grad Max: 0.108312 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000651 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005162 | Grad Max: 0.010679 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000332 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001536 | Grad Max: 0.003785 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003634 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036007 | Grad Max: 0.036007 [GRADIENT NORM TOTAL] 8.5870 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.522 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015819 0.49841812] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 630/1418 | B: 522/1526 | C: 266/1782 [LOSS Ex1] A: 0.65751 | B: 0.65929 | C: 0.65559 [LOGITS Ex2 A] Mean Abs: 1.635 | Max: 7.033 [LOSS Ex2] A: 0.19972 | B: 0.43016 | C: 0.31834 ** [JOINT LOSS] ** : 0.973532 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010211 | Grad Max: 0.246863 -> Layer: shared_layers.0.bias | Grad Mean: 0.741150 | Grad Max: 3.030818 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.007673 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002843 | Grad Max: 0.002843 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004761 | Grad Max: 0.552221 -> Layer: exit2_layers.0.bias | Grad Mean: 0.090088 | Grad Max: 3.059556 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000891 | Grad Max: 0.025051 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044063 | Grad Max: 0.189829 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000116 | Grad Max: 0.001342 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009768 | Grad Max: 0.020651 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000623 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002905 | Grad Max: 0.006954 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003720 | Grad Max: 0.007017 -> Layer: exit2_layers.12.bias | Grad Mean: 0.070417 | Grad Max: 0.070417 [GRADIENT NORM TOTAL] 14.6455 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.437 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50400954 0.49599043] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.043 [MASKS] A(Pass/Fail): 602/1446 | B: 523/1525 | C: 261/1787 [LOSS Ex1] A: 0.66337 | B: 0.65590 | C: 0.65756 [LOGITS Ex2 A] Mean Abs: 1.619 | Max: 6.258 [LOSS Ex2] A: 0.19459 | B: 0.41948 | C: 0.31282 ** [JOINT LOSS] ** : 0.967908 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009789 | Grad Max: 0.264185 -> Layer: shared_layers.0.bias | Grad Mean: 0.669376 | Grad Max: 2.738951 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.006058 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000509 | Grad Max: 0.000509 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004382 | Grad Max: 0.530105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.082452 | Grad Max: 2.954613 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000808 | Grad Max: 0.024109 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039877 | Grad Max: 0.182125 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001231 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008874 | Grad Max: 0.018222 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000569 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002629 | Grad Max: 0.006257 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003343 | Grad Max: 0.006017 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062988 | Grad Max: 0.062988 [GRADIENT NORM TOTAL] 13.1925 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.080 | Max: 0.364 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53175616 0.46824378] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 600/1448 | B: 489/1367 | C: 279/1769 [LOSS Ex1] A: 0.66312 | B: 0.65922 | C: 0.65554 [LOGITS Ex2 A] Mean Abs: 1.627 | Max: 5.553 [LOSS Ex2] A: 0.18806 | B: 0.37940 | C: 0.29645 ** [JOINT LOSS] ** : 0.947268 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005722 | Grad Max: 0.171567 -> Layer: shared_layers.0.bias | Grad Mean: 0.386305 | Grad Max: 1.575985 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002007 | Grad Max: 0.006753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008523 | Grad Max: 0.008523 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002507 | Grad Max: 0.312456 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046596 | Grad Max: 1.770070 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.014699 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023147 | Grad Max: 0.110083 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000735 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005134 | Grad Max: 0.011671 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000337 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001511 | Grad Max: 0.003678 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001826 | Grad Max: 0.003784 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034802 | Grad Max: 0.034802 [GRADIENT NORM TOTAL] 7.6180 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.499 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64783424 0.35216576] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.044 [MASKS] A(Pass/Fail): 669/1379 | B: 507/1541 | C: 272/1776 [LOSS Ex1] A: 0.65912 | B: 0.65852 | C: 0.65373 [LOGITS Ex2 A] Mean 
Abs: 1.734 | Max: 6.611 [LOSS Ex2] A: 0.17755 | B: 0.39490 | C: 0.29574 ** [JOINT LOSS] ** : 0.946518 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004812 | Grad Max: 0.122245 -> Layer: shared_layers.0.bias | Grad Mean: 0.239727 | Grad Max: 1.148506 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.007506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008350 | Grad Max: 0.008350 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001579 | Grad Max: 0.153589 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029319 | Grad Max: 0.820261 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009712 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014165 | Grad Max: 0.073763 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000528 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003177 | Grad Max: 0.007571 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000946 | Grad Max: 0.002157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001200 | Grad Max: 0.003076 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023199 | Grad Max: 0.023199 [GRADIENT NORM TOTAL] 4.6084 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.560 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500487 0.499513] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 644/1404 | B: 522/1526 | C: 179/1197 [LOSS Ex1] A: 0.66379 | B: 0.65919 | C: 0.65457 [LOGITS Ex2 A] Mean Abs: 1.741 | Max: 5.586 [LOSS Ex2] A: 0.18591 | B: 0.40150 | C: 0.33568 ** [JOINT LOSS] ** : 0.966876 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007214 | Grad Max: 0.170176 -> Layer: shared_layers.0.bias | Grad Mean: 0.487438 | Grad Max: 2.268451 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.006209 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.002696 | Grad Max: 0.002696 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003078 | Grad Max: 0.293656 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058063 | Grad Max: 1.604393 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.019736 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028085 | Grad Max: 0.145123 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000898 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006201 | Grad Max: 0.013845 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000416 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001844 | Grad Max: 0.004397 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002367 | Grad Max: 0.004209 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044261 | Grad Max: 0.044261 [GRADIENT NORM TOTAL] 9.4990 [EPOCH SUMMARY] Train Loss: 0.9629 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9368 | Alpha: 0.5500 No improve count: 4/15 ############################## EPOCH 78/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.356 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62171084 0.37828916] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 623/1425 | B: 524/1524 | C: 256/1792 [LOSS Ex1] A: 0.66110 | B: 0.65580 | C: 0.65545 [LOGITS Ex2 A] Mean Abs: 1.723 | Max: 5.613 [LOSS Ex2] A: 0.19033 | B: 0.38589 | C: 0.29578 ** [JOINT LOSS] ** : 0.948115 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006113 | Grad Max: 0.143883 -> Layer: shared_layers.0.bias | Grad Mean: 0.389257 | Grad Max: 1.704046 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.007009 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006667 | Grad Max: 0.006667 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002479 | Grad Max: 0.258095 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.046315 | Grad Max: 1.391597 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000450 | Grad Max: 0.015012 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022176 | Grad Max: 0.110390 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000696 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004930 | Grad Max: 0.010262 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000291 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001474 | Grad Max: 0.003348 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001876 | Grad Max: 0.004056 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035714 | Grad Max: 0.035714 [GRADIENT NORM TOTAL] 7.5601 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.432 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57700163 0.42299837] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 510/1106 | B: 490/1366 | C: 266/1782 [LOSS Ex1] A: 0.65986 | B: 0.65913 | C: 0.65420 [LOGITS Ex2 A] Mean Abs: 1.722 | Max: 5.876 [LOSS Ex2] A: 0.17384 | B: 0.37484 | C: 0.29398 ** [JOINT LOSS] ** : 0.938614 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001672 | Grad Max: 0.024073 -> Layer: shared_layers.0.bias | Grad Mean: 0.079891 | Grad Max: 0.312055 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.006905 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001533 | Grad Max: 0.001533 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000673 | Grad Max: 0.105138 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011952 | Grad Max: 0.583209 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004171 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004845 | Grad Max: 0.029090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000292 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001077 | Grad Max: 0.003361 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000318 | Grad Max: 0.000912 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000529 | Grad Max: 0.001728 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007362 | Grad Max: 0.007362 [GRADIENT NORM TOTAL] 1.9281 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.561 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506181 0.49381894] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 646/1402 | B: 508/1540 | C: 305/1743 [LOSS Ex1] A: 0.66010 | B: 0.65842 | C: 0.65230 [LOGITS Ex2 A] Mean Abs: 1.694 | Max: 6.213 [LOSS Ex2] A: 0.17284 | B: 0.40449 | C: 0.30650 ** [JOINT LOSS] ** : 0.951545 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002521 | Grad Max: 0.073056 -> Layer: shared_layers.0.bias | Grad Mean: 0.151931 | Grad Max: 0.687813 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006952 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002913 | Grad Max: 0.002913 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000990 | Grad Max: 0.131348 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017977 | Grad Max: 0.741198 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000188 | Grad Max: 0.006936 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009293 | Grad Max: 0.050312 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000384 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002080 | Grad Max: 0.005514 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000159 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000628 | Grad Max: 0.001661 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000923 | Grad Max: 0.002564 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015836 | Grad Max: 0.015836 [GRADIENT NORM TOTAL] 2.9648 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.509 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5089145 0.4910855] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 628/1420 | B: 522/1526 | C: 277/1771 [LOSS Ex1] A: 0.65829 | B: 0.65908 | C: 0.65377 [LOGITS Ex2 A] Mean Abs: 1.710 | Max: 5.427 [LOSS Ex2] A: 0.17717 | B: 0.39666 | C: 0.29575 ** [JOINT LOSS] ** : 0.946906 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003172 | Grad Max: 0.091576 -> Layer: shared_layers.0.bias | Grad Mean: 0.094382 | Grad Max: 0.384567 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.007683 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005887 | Grad Max: 0.005887 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000718 | Grad Max: 0.132072 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012805 | Grad Max: 0.704798 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.003787 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005119 | Grad Max: 0.022268 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000261 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001155 | Grad Max: 0.003461 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000334 | Grad Max: 0.001010 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000452 | Grad Max: 0.001606 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007132 | Grad Max: 0.007132 [GRADIENT NORM TOTAL] 2.1391 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.525 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50156283 0.49843723] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 634/1414 | B: 524/1524 | C: 272/1776 [LOSS Ex1] A: 0.65730 | B: 0.65567 | C: 0.65338 [LOGITS Ex2 A] Mean Abs: 1.701 | Max: 6.728 [LOSS 
Ex2] A: 0.19324 | B: 0.38520 | C: 0.29293 ** [JOINT LOSS] ** : 0.945906 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.063630 -> Layer: shared_layers.0.bias | Grad Mean: 0.049847 | Grad Max: 0.246486 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.007297 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001312 | Grad Max: 0.001312 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000522 | Grad Max: 0.100372 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008438 | Grad Max: 0.530700 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.002433 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002394 | Grad Max: 0.017983 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000160 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000552 | Grad Max: 0.002223 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000175 | Grad Max: 0.000713 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000446 | Grad Max: 0.001348 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004291 | Grad Max: 0.004291 [GRADIENT NORM TOTAL] 1.4414 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.440 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503967 0.496033] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.535 | Std: 0.043 [MASKS] A(Pass/Fail): 605/1443 | B: 491/1365 | C: 278/1770 [LOSS Ex1] A: 0.66317 | B: 0.65899 | C: 0.65436 [LOGITS Ex2 A] Mean Abs: 1.653 | Max: 6.154 [LOSS Ex2] A: 0.17732 | B: 0.37634 | C: 0.30652 ** [JOINT LOSS] ** : 0.945565 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004364 | Grad Max: 0.157970 -> Layer: shared_layers.0.bias | Grad Mean: 0.223747 | Grad Max: 0.855146 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.006169 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000255 | Grad Max: 0.000255 
-> Layer: exit2_layers.0.weight | Grad Mean: 0.001606 | Grad Max: 0.224761 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029733 | Grad Max: 1.264964 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.009864 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013412 | Grad Max: 0.064804 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003015 | Grad Max: 0.006691 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002272 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001123 | Grad Max: 0.002665 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020368 | Grad Max: 0.020368 [GRADIENT NORM TOTAL] 4.6059 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.081 | Max: 0.368 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.532057 0.46794304] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 601/1447 | B: 510/1538 | C: 253/1795 [LOSS Ex1] A: 0.66291 | B: 0.65826 | C: 0.65439 [LOGITS Ex2 A] Mean Abs: 1.658 | Max: 6.412 [LOSS Ex2] A: 0.18077 | B: 0.39455 | C: 0.29404 ** [JOINT LOSS] ** : 0.948305 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004311 | Grad Max: 0.156626 -> Layer: shared_layers.0.bias | Grad Mean: 0.106630 | Grad Max: 0.433461 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.006174 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006361 | Grad Max: 0.006361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000863 | Grad Max: 0.215841 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014964 | Grad Max: 1.212016 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000131 | Grad Max: 0.004129 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006050 | Grad Max: 0.024584 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000264 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.001424 | Grad Max: 0.003873 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000427 | Grad Max: 0.001254 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000638 | Grad Max: 0.001867 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010135 | Grad Max: 0.010135 [GRADIENT NORM TOTAL] 2.5360 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.502 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64932144 0.3506786 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.044 [MASKS] A(Pass/Fail): 670/1378 | B: 524/1524 | C: 273/1775 [LOSS Ex1] A: 0.65885 | B: 0.65891 | C: 0.65416 [LOGITS Ex2 A] Mean Abs: 1.722 | Max: 6.391 [LOSS Ex2] A: 0.16983 | B: 0.39835 | C: 0.32468 ** [JOINT LOSS] ** : 0.954930 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003122 | Grad Max: 0.122284 -> Layer: shared_layers.0.bias | Grad Mean: 0.336169 | Grad Max: 1.583581 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.007469 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008245 | Grad Max: 0.008245 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.225895 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039828 | Grad Max: 1.273723 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.012061 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019730 | Grad Max: 0.086214 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000635 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004306 | Grad Max: 0.009279 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001260 | Grad Max: 0.002893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001536 | Grad Max: 0.003230 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029419 | Grad Max: 0.029419 [GRADIENT NORM TOTAL] 
6.9280 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.564 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005332 0.49946684] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.043 [MASKS] A(Pass/Fail): 647/1401 | B: 526/1522 | C: 269/1779 [LOSS Ex1] A: 0.66354 | B: 0.65549 | C: 0.65592 [LOGITS Ex2 A] Mean Abs: 1.740 | Max: 6.005 [LOSS Ex2] A: 0.17150 | B: 0.38376 | C: 0.30887 ** [JOINT LOSS] ** : 0.946362 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003748 | Grad Max: 0.138263 -> Layer: shared_layers.0.bias | Grad Mean: 0.379119 | Grad Max: 1.830999 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.006139 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000790 | Grad Max: 0.000790 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002391 | Grad Max: 0.236937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044721 | Grad Max: 1.310992 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000443 | Grad Max: 0.014002 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022033 | Grad Max: 0.098024 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000712 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004801 | Grad Max: 0.010216 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000305 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001409 | Grad Max: 0.003414 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001746 | Grad Max: 0.003781 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033359 | Grad Max: 0.033359 [GRADIENT NORM TOTAL] 7.6388 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.360 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6230212 0.37697875] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 626/1422 | B: 493/1363 | C: 270/1778 [LOSS Ex1] A: 0.66081 
| B: 0.65882 | C: 0.65502 [LOGITS Ex2 A] Mean Abs: 1.720 | Max: 6.702 [LOSS Ex2] A: 0.18829 | B: 0.37044 | C: 0.31236 ** [JOINT LOSS] ** : 0.948583 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001938 | Grad Max: 0.046554 -> Layer: shared_layers.0.bias | Grad Mean: 0.063250 | Grad Max: 0.306866 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.006567 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004539 | Grad Max: 0.004539 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000544 | Grad Max: 0.125344 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009021 | Grad Max: 0.700514 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003183 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001804 | Grad Max: 0.016240 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000327 | Grad Max: 0.001761 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000078 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000663 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.001145 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002938 | Grad Max: 0.002938 [GRADIENT NORM TOTAL] 1.7795 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.436 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5779116 0.42208838] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.044 [MASKS] A(Pass/Fail): 513/1103 | B: 512/1536 | C: 285/1763 [LOSS Ex1] A: 0.65954 | B: 0.65809 | C: 0.65202 [LOGITS Ex2 A] Mean Abs: 1.726 | Max: 5.374 [LOSS Ex2] A: 0.17224 | B: 0.40595 | C: 0.29433 ** [JOINT LOSS] ** : 0.947390 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004911 | Grad Max: 0.115427 -> Layer: shared_layers.0.bias | Grad Mean: 0.328948 | Grad Max: 1.366957 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 
0.006829 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001876 | Grad Max: 0.001876 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.283182 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039069 | Grad Max: 1.588446 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.011198 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018242 | Grad Max: 0.084266 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000627 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004041 | Grad Max: 0.008989 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000256 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001199 | Grad Max: 0.002748 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001650 | Grad Max: 0.003315 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029446 | Grad Max: 0.029446 [GRADIENT NORM TOTAL] 6.7808 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.566 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506221 0.49377906] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.044 [MASKS] A(Pass/Fail): 648/1400 | B: 526/1522 | C: 302/1746 [LOSS Ex1] A: 0.65977 | B: 0.65875 | C: 0.65000 [LOGITS Ex2 A] Mean Abs: 1.711 | Max: 5.904 [LOSS Ex2] A: 0.17493 | B: 0.40569 | C: 0.28381 ** [JOINT LOSS] ** : 0.944319 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003738 | Grad Max: 0.107973 -> Layer: shared_layers.0.bias | Grad Mean: 0.306941 | Grad Max: 1.478740 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.007533 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007567 | Grad Max: 0.007567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001934 | Grad Max: 0.263368 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036198 | Grad Max: 1.476573 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.012408 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016639 | Grad Max: 0.089099 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000536 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003601 | Grad Max: 0.008065 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001049 | Grad Max: 0.002650 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.002913 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024802 | Grad Max: 0.024802 [GRADIENT NORM TOTAL] 6.3175 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.515 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5090646 0.49093536] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 630/1418 | B: 529/1519 | C: 263/1785 [LOSS Ex1] A: 0.65792 | B: 0.65532 | C: 0.65518 [LOGITS Ex2 A] Mean Abs: 1.736 | Max: 5.602 [LOSS Ex2] A: 0.17778 | B: 0.38006 | C: 0.30747 ** [JOINT LOSS] ** : 0.944578 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004251 | Grad Max: 0.167864 -> Layer: shared_layers.0.bias | Grad Mean: 0.064492 | Grad Max: 0.310871 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.007522 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003842 | Grad Max: 0.003842 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000689 | Grad Max: 0.113062 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011099 | Grad Max: 0.594660 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003280 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002693 | Grad Max: 0.019047 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000206 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000639 | Grad Max: 0.002233 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000186 | Grad Max: 0.000680 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000355 | Grad Max: 0.001390 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.004621 | Grad Max: 0.004621 [GRADIENT NORM TOTAL] 1.8802 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.531 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50161123 0.49838874] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 634/1414 | B: 494/1362 | C: 180/1196 [LOSS Ex1] A: 0.65691 | B: 0.65866 | C: 0.65502 [LOGITS Ex2 A] Mean Abs: 1.750 | Max: 8.022 [LOSS Ex2] A: 0.20003 | B: 0.36335 | C: 0.31305 ** [JOINT LOSS] ** : 0.949003 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004577 | Grad Max: 0.185731 -> Layer: shared_layers.0.bias | Grad Mean: 0.114685 | Grad Max: 0.377449 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.007257 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000673 | Grad Max: 0.000673 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001006 | Grad Max: 0.122711 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017368 | Grad Max: 0.690644 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.005547 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007055 | Grad Max: 0.042188 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000331 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001682 | Grad Max: 0.004613 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000502 | Grad Max: 0.001485 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000660 | Grad Max: 0.002127 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011987 | Grad Max: 0.011987 [GRADIENT NORM TOTAL] 2.6601 [EPOCH SUMMARY] Train Loss: 0.9472 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9263 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9346 -> New: 0.9263) ############################## EPOCH 79/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.445 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040043 0.49599573] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 607/1441 | B: 513/1535 | C: 295/1753 [LOSS Ex1] A: 0.66286 | B: 0.65792 | C: 0.65190 [LOGITS Ex2 A] Mean Abs: 1.708 | Max: 5.648 [LOSS Ex2] A: 0.17109 | B: 0.39931 | C: 0.29661 ** [JOINT LOSS] ** : 0.946563 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.103064 -> Layer: shared_layers.0.bias | Grad Mean: 0.077792 | Grad Max: 0.413695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.006060 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002294 | Grad Max: 0.002294 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000704 | Grad Max: 0.081625 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012203 | Grad Max: 0.453901 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.004265 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005419 | Grad Max: 0.026384 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001281 | Grad Max: 0.003528 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000112 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001143 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000617 | Grad Max: 0.001752 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010257 | Grad Max: 0.010257 [GRADIENT NORM TOTAL] 1.8384 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.373 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5324548 0.46754518] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.535 | Std: 0.042 [MASKS] A(Pass/Fail): 603/1445 | B: 528/1520 | C: 266/1782 [LOSS Ex1] A: 0.66258 | B: 0.65857 | C: 0.65282 [LOGITS Ex2 A] Mean Abs: 1.685 | Max: 5.575 [LOSS Ex2] A: 0.18429 | B: 0.39141 | C: 0.28748 ** [JOINT LOSS] ** : 0.945721 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003626 | Grad Max: 0.147380 -> Layer: shared_layers.0.bias | Grad Mean: 0.073593 | Grad Max: 0.311938 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.006505 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003817 | Grad Max: 0.003817 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000695 | Grad Max: 0.140898 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010906 | Grad Max: 0.802423 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.004125 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001874 | Grad Max: 0.025112 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000185 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000252 | Grad Max: 0.001738 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000067 | Grad Max: 0.000355 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001197 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000384 | Grad Max: 0.000384 [GRADIENT NORM TOTAL] 2.0170 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.507 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.65120375 0.34879625] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.045 [MASKS] A(Pass/Fail): 673/1375 | B: 530/1518 | C: 272/1776 [LOSS Ex1] A: 0.65847 | B: 0.65512 | C: 0.65174 [LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.255 [LOSS Ex2] A: 0.17917 | B: 0.37153 | C: 0.29102 ** [JOINT LOSS] ** : 0.935683 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.077887 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.191369 | Grad Max: 0.977474 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.007265 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005062 | Grad Max: 0.005062 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001285 | Grad Max: 0.171834 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022967 | Grad Max: 0.974400 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.008487 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009798 | Grad Max: 0.066481 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000324 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002035 | Grad Max: 0.004794 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000599 | Grad Max: 0.001487 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000753 | Grad Max: 0.002329 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015087 | Grad Max: 0.015087 [GRADIENT NORM TOTAL] 4.1031 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.570 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005087 0.49949127] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 648/1400 | B: 494/1362 | C: 279/1769 [LOSS Ex1] A: 0.66321 | B: 0.65846 | C: 0.65420 [LOGITS Ex2 A] Mean Abs: 1.736 | Max: 5.979 [LOSS Ex2] A: 0.17106 | B: 0.37261 | C: 0.29870 ** [JOINT LOSS] ** : 0.939409 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003716 | Grad Max: 0.127788 -> Layer: shared_layers.0.bias | Grad Mean: 0.060660 | Grad Max: 0.276597 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.006691 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004984 | Grad Max: 0.004984 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000648 | Grad Max: 0.149375 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010656 | Grad Max: 0.822123 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000081 | Grad Max: 0.003011 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003539 | Grad Max: 0.017238 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000176 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000904 | Grad Max: 0.002841 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.001147 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001879 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006570 | Grad Max: 0.006570 [GRADIENT NORM TOTAL] 1.7765 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.365 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62449944 0.37550056] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 629/1419 | B: 514/1534 | C: 253/1795 [LOSS Ex1] A: 0.66043 | B: 0.65771 | C: 0.65654 [LOGITS Ex2 A] Mean Abs: 1.733 | Max: 5.716 [LOSS Ex2] A: 0.18506 | B: 0.39926 | C: 0.30108 ** [JOINT LOSS] ** : 0.953363 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.045623 -> Layer: shared_layers.0.bias | Grad Mean: 0.057481 | Grad Max: 0.290084 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.007100 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006154 | Grad Max: 0.006154 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000484 | Grad Max: 0.156067 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008221 | Grad Max: 0.878404 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003204 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002469 | Grad Max: 0.017738 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000153 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000542 | Grad Max: 0.002370 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000087 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000168 | Grad 
Max: 0.000685 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000279 | Grad Max: 0.001220 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004460 | Grad Max: 0.004460 [GRADIENT NORM TOTAL] 1.7477 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.441 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57888484 0.42111516] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 515/1101 | B: 529/1519 | C: 303/1745 [LOSS Ex1] A: 0.65914 | B: 0.65836 | C: 0.65112 [LOGITS Ex2 A] Mean Abs: 1.753 | Max: 5.858 [LOSS Ex2] A: 0.17276 | B: 0.39956 | C: 0.30142 ** [JOINT LOSS] ** : 0.947457 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002764 | Grad Max: 0.073098 -> Layer: shared_layers.0.bias | Grad Mean: 0.166055 | Grad Max: 0.879594 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006944 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005528 | Grad Max: 0.005528 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.182210 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019750 | Grad Max: 1.014890 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.007665 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008454 | Grad Max: 0.055542 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000296 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001744 | Grad Max: 0.004194 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000512 | Grad Max: 0.001384 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000713 | Grad Max: 0.001892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013078 | Grad Max: 0.013078 [GRADIENT NORM TOTAL] 3.5383 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.572 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5063304 0.4936696] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 648/1400 | B: 531/1517 | C: 266/1782 [LOSS Ex1] A: 0.65938 | B: 0.65489 | C: 0.65562 [LOGITS Ex2 A] Mean Abs: 1.754 | Max: 6.901 [LOSS Ex2] A: 0.17753 | B: 0.36632 | C: 0.30605 ** [JOINT LOSS] ** : 0.939928 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005196 | Grad Max: 0.226777 -> Layer: shared_layers.0.bias | Grad Mean: 0.062685 | Grad Max: 0.273639 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006561 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000360 | Grad Max: 0.000360 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000707 | Grad Max: 0.083520 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010466 | Grad Max: 0.465641 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.003081 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002571 | Grad Max: 0.014888 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000676 | Grad Max: 0.002419 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000224 | Grad Max: 0.000761 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000490 | Grad Max: 0.001562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005856 | Grad Max: 0.005856 [GRADIENT NORM TOTAL] 1.7188 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.521 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50913763 0.49086234] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 633/1415 | B: 494/1362 | C: 266/1782 [LOSS Ex1] A: 0.65749 | B: 0.65823 | C: 0.65214 [LOGITS Ex2 A] Mean Abs: 1.774 | Max: 6.798 [LOSS Ex2] A: 0.17036 | B: 0.36847 | C: 0.29542 ** [JOINT LOSS] ** : 0.934036 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.003736 | Grad Max: 0.148007 -> Layer: shared_layers.0.bias | Grad Mean: 0.100224 | Grad Max: 0.501254 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.007501 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006337 | Grad Max: 0.006337 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000729 | Grad Max: 0.100359 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012936 | Grad Max: 0.552802 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.004419 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005564 | Grad Max: 0.032200 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000222 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001261 | Grad Max: 0.003261 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000112 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000374 | Grad Max: 0.000982 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000502 | Grad Max: 0.001809 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008883 | Grad Max: 0.008883 [GRADIENT NORM TOTAL] 2.1737 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.538 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50171363 0.4982864 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 636/1412 | B: 515/1533 | C: 281/1767 [LOSS Ex1] A: 0.65646 | B: 0.65747 | C: 0.65300 [LOGITS Ex2 A] Mean Abs: 1.743 | Max: 6.800 [LOSS Ex2] A: 0.17852 | B: 0.39634 | C: 0.30336 ** [JOINT LOSS] ** : 0.948383 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003174 | Grad Max: 0.075616 -> Layer: shared_layers.0.bias | Grad Mean: 0.224632 | Grad Max: 0.822249 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.007406 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002874 | Grad Max: 0.002874 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001373 | Grad Max: 0.237133 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.025856 | Grad Max: 1.320731 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.009497 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012290 | Grad Max: 0.064077 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000410 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002643 | Grad Max: 0.006447 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000770 | Grad Max: 0.001865 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001088 | Grad Max: 0.002171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019009 | Grad Max: 0.019009 [GRADIENT NORM TOTAL] 4.4213 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.450 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040556 0.49594438] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.536 | Std: 0.044 [MASKS] A(Pass/Fail): 608/1440 | B: 530/1518 | C: 258/1790 [LOSS Ex1] A: 0.66247 | B: 0.65811 | C: 0.65574 [LOGITS Ex2 A] Mean Abs: 1.742 | Max: 5.857 [LOSS Ex2] A: 0.17921 | B: 0.40000 | C: 0.30683 ** [JOINT LOSS] ** : 0.954122 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005287 | Grad Max: 0.200987 -> Layer: shared_layers.0.bias | Grad Mean: 0.176547 | Grad Max: 0.657440 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.006185 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000866 | Grad Max: 0.000866 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001256 | Grad Max: 0.189459 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022745 | Grad Max: 1.036427 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000218 | Grad Max: 0.005712 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010452 | Grad Max: 0.040634 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000418 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002376 | Grad Max: 0.005451 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000705 | Grad Max: 0.001772 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001093 | Grad Max: 0.002179 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018207 | Grad Max: 0.018207 [GRADIENT NORM TOTAL] 3.4268 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.082 | Max: 0.378 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5328027 0.46719727] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.043 [MASKS] A(Pass/Fail): 604/1444 | B: 531/1517 | C: 285/1763 [LOSS Ex1] A: 0.66220 | B: 0.65463 | C: 0.65179 [LOGITS Ex2 A] Mean Abs: 1.750 | Max: 6.529 [LOSS Ex2] A: 0.18810 | B: 0.37590 | C: 0.30375 ** [JOINT LOSS] ** : 0.945456 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004526 | Grad Max: 0.157552 -> Layer: shared_layers.0.bias | Grad Mean: 0.325635 | Grad Max: 1.761979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006761 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001498 | Grad Max: 0.001498 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.254898 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038141 | Grad Max: 1.436574 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.013436 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017069 | Grad Max: 0.097664 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000525 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003533 | Grad Max: 0.007712 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000232 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001014 | Grad Max: 0.002674 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001152 | Grad Max: 0.002696 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023235 | Grad Max: 0.023235 [GRADIENT NORM TOTAL] 6.8218 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.512 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6533809 0.34661916] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.045 [MASKS] A(Pass/Fail): 674/1374 | B: 494/1362 | C: 298/1750 [LOSS Ex1] A: 0.65804 | B: 0.65799 | C: 0.64955 [LOGITS Ex2 A] Mean Abs: 1.811 | Max: 6.428 [LOSS Ex2] A: 0.16804 | B: 0.38407 | C: 0.29628 ** [JOINT LOSS] ** : 0.937991 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004193 | Grad Max: 0.132219 -> Layer: shared_layers.0.bias | Grad Mean: 0.287439 | Grad Max: 1.408444 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.007067 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003039 | Grad Max: 0.003039 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.246729 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035009 | Grad Max: 1.390565 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.013052 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015431 | Grad Max: 0.090907 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000482 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003201 | Grad Max: 0.007064 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000919 | Grad Max: 0.002115 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001135 | Grad Max: 0.002449 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021949 | Grad Max: 0.021949 [GRADIENT NORM TOTAL] 6.1488 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.576 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50051105 0.49948892] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 648/1400 | B: 517/1531 | C: 281/1767 [LOSS Ex1] A: 0.66283 | B: 0.65723 | C: 0.65252 [LOGITS Ex2 A] Mean Abs: 1.793 | Max: 
6.365 [LOSS Ex2] A: 0.17454 | B: 0.38762 | C: 0.30392 ** [JOINT LOSS] ** : 0.946222 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004869 | Grad Max: 0.225349 -> Layer: shared_layers.0.bias | Grad Mean: 0.127665 | Grad Max: 0.488586 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006404 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003277 | Grad Max: 0.003277 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001056 | Grad Max: 0.104239 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016841 | Grad Max: 0.571942 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.006206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005011 | Grad Max: 0.040804 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000817 | Grad Max: 0.003009 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000220 | Grad Max: 0.000675 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000300 | Grad Max: 0.001171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004760 | Grad Max: 0.004760 [GRADIENT NORM TOTAL] 2.6807 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.370 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62610507 0.3738949 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.045 [MASKS] A(Pass/Fail): 630/1418 | B: 532/1516 | C: 162/1214 [LOSS Ex1] A: 0.66003 | B: 0.65789 | C: 0.65773 [LOGITS Ex2 A] Mean Abs: 1.756 | Max: 6.244 [LOSS Ex2] A: 0.18525 | B: 0.40048 | C: 0.30867 ** [JOINT LOSS] ** : 0.956686 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007315 | Grad Max: 0.200693 -> Layer: shared_layers.0.bias | Grad Mean: 0.361703 | Grad Max: 1.535373 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.006608 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008955 | 
Grad Max: 0.008955 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002431 | Grad Max: 0.255090 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044959 | Grad Max: 1.412108 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.012128 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021788 | Grad Max: 0.089359 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000618 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004883 | Grad Max: 0.009995 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000312 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001427 | Grad Max: 0.003308 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001900 | Grad Max: 0.003408 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034456 | Grad Max: 0.034456 [GRADIENT NORM TOTAL] 7.0633 [EPOCH SUMMARY] Train Loss: 0.9451 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9278 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 80/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.446 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.57988304 0.420117 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.045 [MASKS] A(Pass/Fail): 520/1096 | B: 532/1516 | C: 278/1770 [LOSS Ex1] A: 0.65874 | B: 0.65441 | C: 0.65175 [LOGITS Ex2 A] Mean Abs: 1.792 | Max: 5.665 [LOSS Ex2] A: 0.15899 | B: 0.38081 | C: 0.29643 ** [JOINT LOSS] ** : 0.933709 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003812 | Grad Max: 0.106518 -> Layer: shared_layers.0.bias | Grad Mean: 0.318918 | Grad Max: 1.383822 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.006912 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005369 | Grad Max: 0.005369 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.230262 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.037871 | Grad Max: 1.303062 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.012121 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018615 | Grad Max: 0.093161 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000585 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004030 | Grad Max: 0.009030 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001161 | Grad Max: 0.002698 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001553 | Grad Max: 0.003124 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027830 | Grad Max: 0.027830 [GRADIENT NORM TOTAL] 6.4406 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.577 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506386 0.493614] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 650/1398 | B: 494/1362 | C: 278/1770 [LOSS Ex1] A: 0.65899 | B: 0.65779 | C: 0.65186 [LOGITS Ex2 A] Mean Abs: 1.787 | Max: 7.330 [LOSS Ex2] A: 0.18021 | B: 0.37023 | C: 0.28497 ** [JOINT LOSS] ** : 0.934685 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005131 | Grad Max: 0.251746 -> Layer: shared_layers.0.bias | Grad Mean: 0.074662 | Grad Max: 0.313132 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006887 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004332 | Grad Max: 0.004332 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000830 | Grad Max: 0.111386 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012817 | Grad Max: 0.591660 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.004387 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002744 | Grad Max: 0.023988 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000215 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.002621 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000005 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000233 | Grad Max: 0.000769 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000425 | Grad Max: 0.001531 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006278 | Grad Max: 0.006278 [GRADIENT NORM TOTAL] 2.1594 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.527 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5092505 0.49074948] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 635/1413 | B: 518/1530 | C: 277/1771 [LOSS Ex1] A: 0.65708 | B: 0.65703 | C: 0.65397 [LOGITS Ex2 A] Mean Abs: 1.808 | Max: 6.445 [LOSS Ex2] A: 0.18217 | B: 0.39684 | C: 0.29378 ** [JOINT LOSS] ** : 0.946956 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007007 | Grad Max: 0.234749 -> Layer: shared_layers.0.bias | Grad Mean: 0.252796 | Grad Max: 0.845870 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002373 | Grad Max: 0.008229 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015435 | Grad Max: 0.015435 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001791 | Grad Max: 0.168842 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032844 | Grad Max: 0.918997 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000322 | Grad Max: 0.009553 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015619 | Grad Max: 0.066981 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000556 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003471 | Grad Max: 0.008061 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001001 | Grad Max: 0.002398 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.002717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024191 | Grad Max: 0.024191 [GRADIENT NORM TOTAL] 4.8399 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.544 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017686 0.4982314] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 638/1410 | B: 532/1516 | C: 252/1796 [LOSS Ex1] A: 0.65607 | B: 0.65769 | C: 0.65579 [LOGITS Ex2 A] Mean Abs: 1.785 | Max: 7.376 [LOSS Ex2] A: 0.18944 | B: 0.39178 | C: 0.30104 ** [JOINT LOSS] ** : 0.950600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003959 | Grad Max: 0.136602 -> Layer: shared_layers.0.bias | Grad Mean: 0.054338 | Grad Max: 0.253010 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.007230 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002236 | Grad Max: 0.002236 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000610 | Grad Max: 0.070444 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009009 | Grad Max: 0.364294 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003610 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001635 | Grad Max: 0.019051 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000274 | Grad Max: 0.002101 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000073 | Grad Max: 0.000472 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000207 | Grad Max: 0.000965 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000613 | Grad Max: 0.000613 [GRADIENT NORM TOTAL] 1.4702 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.455 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040564 0.49594364] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.045 [MASKS] A(Pass/Fail): 612/1436 | B: 533/1515 | C: 270/1778 [LOSS Ex1] A: 0.66213 | B: 0.65420 | C: 0.65337 [LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.065 [LOSS Ex2] A: 0.17243 | B: 0.38750 | C: 
0.29575 ** [JOINT LOSS] ** : 0.941794 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005463 | Grad Max: 0.142037 -> Layer: shared_layers.0.bias | Grad Mean: 0.249711 | Grad Max: 1.025673 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.006027 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003902 | Grad Max: 0.003902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001777 | Grad Max: 0.275897 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032574 | Grad Max: 1.529553 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.007938 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015308 | Grad Max: 0.060142 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000473 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003434 | Grad Max: 0.007402 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000253 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000987 | Grad Max: 0.002747 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001331 | Grad Max: 0.002651 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023474 | Grad Max: 0.023474 [GRADIENT NORM TOTAL] 5.1663 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.083 | Max: 0.383 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53318006 0.4668199 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.536 | Std: 0.044 [MASKS] A(Pass/Fail): 605/1443 | B: 496/1360 | C: 289/1759 [LOSS Ex1] A: 0.66187 | B: 0.65759 | C: 0.65326 [LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.934 [LOSS Ex2] A: 0.18677 | B: 0.36922 | C: 0.33461 ** [JOINT LOSS] ** : 0.954435 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005685 | Grad Max: 0.217202 -> Layer: shared_layers.0.bias | Grad Mean: 0.080434 | Grad Max: 0.335134 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.006257 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002974 | Grad Max: 0.002974 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.000836 | Grad Max: 0.151741 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013534 | Grad Max: 0.855128 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000110 | Grad Max: 0.003789 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004704 | Grad Max: 0.022593 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000332 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001211 | Grad Max: 0.003537 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000370 | Grad Max: 0.001260 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000607 | Grad Max: 0.001497 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009672 | Grad Max: 0.009672 [GRADIENT NORM TOTAL] 2.2377 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.518 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.65540504 0.34459496] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 676/1372 | B: 519/1529 | C: 280/1768 [LOSS Ex1] A: 0.65766 | B: 0.65682 | C: 0.65187 [LOGITS Ex2 A] Mean Abs: 1.824 | Max: 6.250 [LOSS Ex2] A: 0.17724 | B: 0.40505 | C: 0.30787 ** [JOINT LOSS] ** : 0.952171 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003798 | Grad Max: 0.130572 -> Layer: shared_layers.0.bias | Grad Mean: 0.364875 | Grad Max: 1.662331 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.007397 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005895 | Grad Max: 0.005895 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.233082 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043714 | Grad Max: 1.272204 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000417 | Grad Max: 0.017302 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020948 | Grad Max: 0.123132 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000676 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.004394 | Grad Max: 0.009941 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000254 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001247 | Grad Max: 0.002953 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001570 | Grad Max: 0.002875 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029652 | Grad Max: 0.029652 [GRADIENT NORM TOTAL] 7.5343 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.582 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005553 0.49944472] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 650/1398 | B: 534/1514 | C: 281/1767 [LOSS Ex1] A: 0.66250 | B: 0.65749 | C: 0.65092 [LOGITS Ex2 A] Mean Abs: 1.804 | Max: 5.961 [LOSS Ex2] A: 0.17371 | B: 0.40010 | C: 0.30225 ** [JOINT LOSS] ** : 0.948993 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004777 | Grad Max: 0.175150 -> Layer: shared_layers.0.bias | Grad Mean: 0.413692 | Grad Max: 2.114551 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002046 | Grad Max: 0.005867 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004508 | Grad Max: 0.004508 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002640 | Grad Max: 0.282084 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047429 | Grad Max: 1.577381 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000432 | Grad Max: 0.015966 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021626 | Grad Max: 0.122072 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000679 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004451 | Grad Max: 0.010409 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001259 | Grad Max: 0.002901 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001502 | Grad Max: 0.003280 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028970 | Grad Max: 0.028970 [GRADIENT NORM TOTAL] 
8.4697 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.375 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62770534 0.37229466] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 637/1411 | B: 535/1513 | C: 263/1785 [LOSS Ex1] A: 0.65968 | B: 0.65399 | C: 0.65401 [LOGITS Ex2 A] Mean Abs: 1.786 | Max: 6.067 [LOSS Ex2] A: 0.19398 | B: 0.37597 | C: 0.30405 ** [JOINT LOSS] ** : 0.947222 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003233 | Grad Max: 0.119968 -> Layer: shared_layers.0.bias | Grad Mean: 0.125490 | Grad Max: 0.544307 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006798 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000121 | Grad Max: 0.000121 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001010 | Grad Max: 0.143216 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017047 | Grad Max: 0.802015 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.006388 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005730 | Grad Max: 0.044852 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000237 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001077 | Grad Max: 0.003231 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000295 | Grad Max: 0.000958 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001218 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006754 | Grad Max: 0.006754 [GRADIENT NORM TOTAL] 2.8616 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.451 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5809616 0.41903844] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 524/1092 | B: 502/1354 | C: 268/1780 [LOSS Ex1] A: 
0.65837 | B: 0.65740 | C: 0.65142 [LOGITS Ex2 A] Mean Abs: 1.788 | Max: 5.890 [LOSS Ex2] A: 0.16700 | B: 0.38268 | C: 0.29953 ** [JOINT LOSS] ** : 0.938797 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006325 | Grad Max: 0.169389 -> Layer: shared_layers.0.bias | Grad Mean: 0.379571 | Grad Max: 1.499977 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.006777 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001250 | Grad Max: 0.001250 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002442 | Grad Max: 0.300442 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045395 | Grad Max: 1.641799 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.014549 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021752 | Grad Max: 0.108752 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000647 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004796 | Grad Max: 0.010372 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000295 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001378 | Grad Max: 0.003165 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001746 | Grad Max: 0.003253 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032348 | Grad Max: 0.032348 [GRADIENT NORM TOTAL] 7.3708 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.584 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063782 0.4936218] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 653/1395 | B: 531/1517 | C: 292/1756 [LOSS Ex1] A: 0.65863 | B: 0.65663 | C: 0.65051 [LOGITS Ex2 A] Mean Abs: 1.758 | Max: 6.218 [LOSS Ex2] A: 0.17258 | B: 0.41642 | C: 0.30186 ** [JOINT LOSS] ** : 0.952209 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005457 | Grad Max: 0.150562 -> Layer: shared_layers.0.bias | Grad Mean: 0.467417 | Grad Max: 1.979501 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad 
Max: 0.007050 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003038 | Grad Max: 0.003038 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002897 | Grad Max: 0.362920 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054374 | Grad Max: 2.054542 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000523 | Grad Max: 0.017443 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026541 | Grad Max: 0.122004 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000729 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005777 | Grad Max: 0.011777 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000350 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001666 | Grad Max: 0.003949 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002056 | Grad Max: 0.003977 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039571 | Grad Max: 0.039571 [GRADIENT NORM TOTAL] 9.2950 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.533 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50946397 0.49053603] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 637/1411 | B: 548/1500 | C: 276/1772 [LOSS Ex1] A: 0.65668 | B: 0.65731 | C: 0.65248 [LOGITS Ex2 A] Mean Abs: 1.776 | Max: 6.196 [LOSS Ex2] A: 0.17139 | B: 0.39697 | C: 0.30910 ** [JOINT LOSS] ** : 0.947977 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004150 | Grad Max: 0.117199 -> Layer: shared_layers.0.bias | Grad Mean: 0.223697 | Grad Max: 0.991245 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.006833 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000541 | Grad Max: 0.000541 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001496 | Grad Max: 0.201495 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026308 | Grad Max: 1.124585 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000216 | Grad Max: 0.007979 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010737 | Grad Max: 0.056171 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002223 | Grad Max: 0.005277 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000646 | Grad Max: 0.001609 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000800 | Grad Max: 0.001830 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015819 | Grad Max: 0.015819 [GRADIENT NORM TOTAL] 4.6066 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.550 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017539 0.49824604] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 639/1409 | B: 545/1503 | C: 272/1776 [LOSS Ex1] A: 0.65567 | B: 0.65380 | C: 0.65229 [LOGITS Ex2 A] Mean Abs: 1.795 | Max: 7.149 [LOSS Ex2] A: 0.20459 | B: 0.37618 | C: 0.31133 ** [JOINT LOSS] ** : 0.951289 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010196 | Grad Max: 0.377459 -> Layer: shared_layers.0.bias | Grad Mean: 0.380662 | Grad Max: 1.676027 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.007946 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006979 | Grad Max: 0.006979 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002804 | Grad Max: 0.259710 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050634 | Grad Max: 1.343517 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.013404 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024100 | Grad Max: 0.102511 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000809 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005502 | Grad Max: 0.012403 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001617 | Grad Max: 0.003794 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002184 | Grad Max: 0.004465 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.039591 | Grad Max: 0.039591 [GRADIENT NORM TOTAL] 7.4215 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.460 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039963 0.4960037] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.045 [MASKS] A(Pass/Fail): 617/1431 | B: 504/1352 | C: 194/1182 [LOSS Ex1] A: 0.66182 | B: 0.65723 | C: 0.65004 [LOGITS Ex2 A] Mean Abs: 1.788 | Max: 6.353 [LOSS Ex2] A: 0.17985 | B: 0.36824 | C: 0.28656 ** [JOINT LOSS] ** : 0.934576 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009592 | Grad Max: 0.302949 -> Layer: shared_layers.0.bias | Grad Mean: 0.384959 | Grad Max: 1.623849 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.006001 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000258 | Grad Max: 0.000258 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002698 | Grad Max: 0.294098 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048885 | Grad Max: 1.511602 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.013335 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023512 | Grad Max: 0.101377 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000753 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005312 | Grad Max: 0.011197 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000317 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001554 | Grad Max: 0.003447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001941 | Grad Max: 0.004072 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036445 | Grad Max: 0.036445 [GRADIENT NORM TOTAL] 7.4543 [EPOCH SUMMARY] Train Loss: 0.9454 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9219 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9263 -> New: 0.9219) ############################## EPOCH 81/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.084 | Max: 0.389 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5335998 0.4664002] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 606/1442 | B: 534/1514 | C: 273/1775 [LOSS Ex1] A: 0.66156 | B: 0.65647 | C: 0.65295 [LOGITS Ex2 A] Mean Abs: 1.716 | Max: 6.033 [LOSS Ex2] A: 0.18334 | B: 0.39734 | C: 0.32704 ** [JOINT LOSS] ** : 0.959564 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006662 | Grad Max: 0.204318 -> Layer: shared_layers.0.bias | Grad Mean: 0.153545 | Grad Max: 0.655684 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006340 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003585 | Grad Max: 0.003585 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001249 | Grad Max: 0.117567 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022055 | Grad Max: 0.628844 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.005546 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009831 | Grad Max: 0.036649 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000422 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002287 | Grad Max: 0.005443 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000666 | Grad Max: 0.001698 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000869 | Grad Max: 0.001855 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015670 | Grad Max: 0.015670 [GRADIENT NORM TOTAL] 3.0837 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.523 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6573193 0.34268063] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.540 | Std: 0.046 [MASKS] A(Pass/Fail): 679/1369 | B: 550/1498 | C: 282/1766 [LOSS Ex1] A: 0.65732 | B: 0.65716 | C: 0.65029 [LOGITS Ex2 A] Mean Abs: 1.720 | Max: 6.254 [LOSS Ex2] A: 0.17187 | B: 0.41470 | C: 0.27924 ** [JOINT LOSS] ** : 0.943525 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005954 | Grad Max: 0.168725 -> Layer: shared_layers.0.bias | Grad Mean: 0.510062 | Grad Max: 2.350912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006980 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006036 | Grad Max: 0.006036 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003226 | Grad Max: 0.356363 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061098 | Grad Max: 2.002646 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000607 | Grad Max: 0.018669 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030785 | Grad Max: 0.140951 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000852 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006671 | Grad Max: 0.014129 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000391 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001906 | Grad Max: 0.004507 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002274 | Grad Max: 0.003838 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043489 | Grad Max: 0.043489 [GRADIENT NORM TOTAL] 10.0719 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.587 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006303 0.49936968] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 653/1395 | B: 546/1502 | C: 308/1740 [LOSS Ex1] A: 0.66221 | B: 0.65364 | C: 0.64951 [LOGITS Ex2 A] Mean Abs: 1.709 | Max: 5.831 [LOSS Ex2] A: 0.18140 | B: 0.40672 | C: 0.31290 ** [JOINT LOSS] ** : 0.955462 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012038 | Grad Max: 0.333008 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.687403 | Grad Max: 3.082189 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006643 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006442 | Grad Max: 0.006442 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004619 | Grad Max: 0.439915 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086036 | Grad Max: 2.304513 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000856 | Grad Max: 0.025002 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042994 | Grad Max: 0.196189 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000111 | Grad Max: 0.001231 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009510 | Grad Max: 0.019413 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000525 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002747 | Grad Max: 0.006288 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003361 | Grad Max: 0.006337 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064008 | Grad Max: 0.064008 [GRADIENT NORM TOTAL] 13.3274 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.380 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62911975 0.37088025] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 637/1411 | B: 504/1352 | C: 248/1800 [LOSS Ex1] A: 0.65937 | B: 0.65708 | C: 0.65271 [LOGITS Ex2 A] Mean Abs: 1.706 | Max: 5.993 [LOSS Ex2] A: 0.19274 | B: 0.38035 | C: 0.30062 ** [JOINT LOSS] ** : 0.947623 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009145 | Grad Max: 0.266270 -> Layer: shared_layers.0.bias | Grad Mean: 0.495783 | Grad Max: 2.127459 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.007262 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004976 | Grad Max: 0.004976 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003243 | Grad Max: 0.283185 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060574 | Grad Max: 1.393398 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000608 | Grad Max: 0.019339 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030545 | Grad Max: 0.140382 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000872 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006732 | Grad Max: 0.013879 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000422 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001926 | Grad Max: 0.004537 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002324 | Grad Max: 0.003990 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043689 | Grad Max: 0.043689 [GRADIENT NORM TOTAL] 9.3145 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.455 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5818856 0.41811445] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.046 [MASKS] A(Pass/Fail): 526/1090 | B: 534/1514 | C: 292/1756 [LOSS Ex1] A: 0.65806 | B: 0.65632 | C: 0.64982 [LOGITS Ex2 A] Mean Abs: 1.765 | Max: 5.954 [LOSS Ex2] A: 0.17247 | B: 0.38988 | C: 0.27920 ** [JOINT LOSS] ** : 0.935255 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005890 | Grad Max: 0.199373 -> Layer: shared_layers.0.bias | Grad Mean: 0.092362 | Grad Max: 0.485583 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.006604 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003541 | Grad Max: 0.003541 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000830 | Grad Max: 0.074697 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011838 | Grad Max: 0.398093 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003643 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002247 | Grad Max: 0.020341 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000237 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000485 | Grad Max: 0.002336 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000149 | Grad 
Max: 0.000626 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000510 | Grad Max: 0.001616 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004009 | Grad Max: 0.004009 [GRADIENT NORM TOTAL] 2.0417 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.589 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064215 0.49357846] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 654/1394 | B: 550/1498 | C: 270/1778 [LOSS Ex1] A: 0.65834 | B: 0.65702 | C: 0.65164 [LOGITS Ex2 A] Mean Abs: 1.793 | Max: 6.515 [LOSS Ex2] A: 0.17342 | B: 0.40667 | C: 0.31212 ** [JOINT LOSS] ** : 0.953069 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005190 | Grad Max: 0.202632 -> Layer: shared_layers.0.bias | Grad Mean: 0.558156 | Grad Max: 2.733904 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.006709 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001161 | Grad Max: 0.001161 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003437 | Grad Max: 0.356448 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064283 | Grad Max: 2.029707 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.021178 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031325 | Grad Max: 0.160414 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000928 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006791 | Grad Max: 0.014095 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000367 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001972 | Grad Max: 0.004459 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002354 | Grad Max: 0.004395 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046288 | Grad Max: 0.046288 [GRADIENT NORM TOTAL] 11.2901 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.538 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.50961715 0.49038285] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 643/1405 | B: 548/1500 | C: 275/1773 [LOSS Ex1] A: 0.65638 | B: 0.65349 | C: 0.65249 [LOGITS Ex2 A] Mean Abs: 1.800 | Max: 5.580 [LOSS Ex2] A: 0.18451 | B: 0.41337 | C: 0.31277 ** [JOINT LOSS] ** : 0.957665 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009634 | Grad Max: 0.242697 -> Layer: shared_layers.0.bias | Grad Mean: 0.728804 | Grad Max: 3.301135 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.007279 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004122 | Grad Max: 0.004122 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004575 | Grad Max: 0.506763 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086158 | Grad Max: 2.765893 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000818 | Grad Max: 0.025991 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041552 | Grad Max: 0.196951 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009075 | Grad Max: 0.019124 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000509 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002619 | Grad Max: 0.006123 -> Layer: exit2_layers.12.weight | Grad Mean: 0.003188 | Grad Max: 0.006005 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060739 | Grad Max: 0.060739 [GRADIENT NORM TOTAL] 14.3447 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.554 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50176126 0.4982388 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 640/1408 | B: 505/1351 | C: 260/1788 [LOSS Ex1] A: 0.65537 | B: 0.65695 | C: 0.65308 [LOGITS Ex2 A] Mean Abs: 1.788 | Max: 5.817 [LOSS Ex2] A: 0.20228 | B: 0.37004 | C: 0.29642 ** [JOINT LOSS] ** : 0.944716 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.009152 | Grad Max: 0.239818 -> Layer: shared_layers.0.bias | Grad Mean: 0.554160 | Grad Max: 2.352176 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.007452 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004804 | Grad Max: 0.004804 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003715 | Grad Max: 0.355053 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069262 | Grad Max: 1.901276 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000659 | Grad Max: 0.020432 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033139 | Grad Max: 0.152170 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000926 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007333 | Grad Max: 0.014796 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000407 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002150 | Grad Max: 0.004786 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002676 | Grad Max: 0.005292 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051279 | Grad Max: 0.051279 [GRADIENT NORM TOTAL] 10.9719 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.463 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50400186 0.49599817] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.537 | Std: 0.045 [MASKS] A(Pass/Fail): 622/1426 | B: 535/1513 | C: 280/1768 [LOSS Ex1] A: 0.66158 | B: 0.65619 | C: 0.65250 [LOGITS Ex2 A] Mean Abs: 1.725 | Max: 5.423 [LOSS Ex2] A: 0.16935 | B: 0.39359 | C: 0.30681 ** [JOINT LOSS] ** : 0.946677 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004314 | Grad Max: 0.150054 -> Layer: shared_layers.0.bias | Grad Mean: 0.060726 | Grad Max: 0.388063 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.006066 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004864 | Grad Max: 0.004864 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000566 | Grad Max: 0.077248 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.008552 | Grad Max: 0.428223 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003322 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001659 | Grad Max: 0.018447 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000163 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000300 | Grad Max: 0.001628 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000591 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001079 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002054 | Grad Max: 0.002054 [GRADIENT NORM TOTAL] 1.5528 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.392 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53387016 0.46612987] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.044 [MASKS] A(Pass/Fail): 610/1438 | B: 551/1497 | C: 280/1768 [LOSS Ex1] A: 0.66132 | B: 0.65689 | C: 0.65225 [LOGITS Ex2 A] Mean Abs: 1.656 | Max: 5.785 [LOSS Ex2] A: 0.18715 | B: 0.40788 | C: 0.29706 ** [JOINT LOSS] ** : 0.954182 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006167 | Grad Max: 0.162745 -> Layer: shared_layers.0.bias | Grad Mean: 0.485147 | Grad Max: 2.235384 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.006252 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000606 | Grad Max: 0.000606 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003055 | Grad Max: 0.257413 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057893 | Grad Max: 1.454363 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.016645 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029197 | Grad Max: 0.136177 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000839 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006349 | Grad Max: 0.013603 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000396 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001822 | Grad Max: 0.004483 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002183 | Grad Max: 0.003967 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042400 | Grad Max: 0.042400 [GRADIENT NORM TOTAL] 9.4466 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.526 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.65864146 0.34135848] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 680/1368 | B: 549/1499 | C: 289/1759 [LOSS Ex1] A: 0.65706 | B: 0.65336 | C: 0.65015 [LOGITS Ex2 A] Mean Abs: 1.695 | Max: 6.947 [LOSS Ex2] A: 0.17090 | B: 0.39574 | C: 0.30140 ** [JOINT LOSS] ** : 0.942869 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006457 | Grad Max: 0.160879 -> Layer: shared_layers.0.bias | Grad Mean: 0.496281 | Grad Max: 2.272225 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002350 | Grad Max: 0.007292 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006852 | Grad Max: 0.006852 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003161 | Grad Max: 0.339691 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059380 | Grad Max: 1.898198 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000570 | Grad Max: 0.017577 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028923 | Grad Max: 0.141136 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000871 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006333 | Grad Max: 0.013727 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000415 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001814 | Grad Max: 0.004707 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002093 | Grad Max: 0.003605 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040573 | Grad Max: 0.040573 [GRADIENT NORM TOTAL] 9.7704 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.591 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50052077 0.49947926] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 656/1392 | B: 506/1350 | C: 272/1776 [LOSS Ex1] A: 0.66199 | B: 0.65682 | C: 0.65269 [LOGITS Ex2 A] Mean Abs: 1.720 | Max: 5.857 [LOSS Ex2] A: 0.16522 | B: 0.37229 | C: 0.30242 ** [JOINT LOSS] ** : 0.937140 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005655 | Grad Max: 0.153572 -> Layer: shared_layers.0.bias | Grad Mean: 0.314178 | Grad Max: 1.422749 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006150 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003119 | Grad Max: 0.003119 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.225124 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038147 | Grad Max: 1.268969 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.011168 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018709 | Grad Max: 0.085546 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000640 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004124 | Grad Max: 0.009411 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000298 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001176 | Grad Max: 0.003369 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001396 | Grad Max: 0.002995 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026654 | Grad Max: 0.026654 [GRADIENT NORM TOTAL] 6.0698 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.382 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6299276 0.37007245] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 637/1411 | B: 535/1513 | C: 265/1783 [LOSS Ex1] A: 0.65913 | B: 0.65605 | C: 0.65250 [LOGITS Ex2 A] Mean Abs: 1.756 | Max: 
5.968 [LOSS Ex2] A: 0.17657 | B: 0.39061 | C: 0.29676 ** [JOINT LOSS] ** : 0.943877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002803 | Grad Max: 0.085848 -> Layer: shared_layers.0.bias | Grad Mean: 0.253659 | Grad Max: 1.076225 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006763 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002749 | Grad Max: 0.002749 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001638 | Grad Max: 0.178319 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030474 | Grad Max: 1.004703 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.010435 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015485 | Grad Max: 0.077942 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000509 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003326 | Grad Max: 0.007456 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000200 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000957 | Grad Max: 0.002166 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001118 | Grad Max: 0.002863 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022188 | Grad Max: 0.022188 [GRADIENT NORM TOTAL] 5.0779 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.458 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58232754 0.4176725 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 526/1090 | B: 553/1495 | C: 180/1196 [LOSS Ex1] A: 0.65781 | B: 0.65675 | C: 0.65328 [LOGITS Ex2 A] Mean Abs: 1.810 | Max: 5.680 [LOSS Ex2] A: 0.16449 | B: 0.39501 | C: 0.30016 ** [JOINT LOSS] ** : 0.942499 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004469 | Grad Max: 0.136601 -> Layer: shared_layers.0.bias | Grad Mean: 0.407912 | Grad Max: 1.738238 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.006476 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001340 | 
Grad Max: 0.001340 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002570 | Grad Max: 0.282283 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048243 | Grad Max: 1.595436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.016911 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024875 | Grad Max: 0.124885 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000803 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005385 | Grad Max: 0.012196 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001558 | Grad Max: 0.003609 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001870 | Grad Max: 0.004041 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036221 | Grad Max: 0.036221 [GRADIENT NORM TOTAL] 8.2101 [EPOCH SUMMARY] Train Loss: 0.9474 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9240 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 82/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.592 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5065801 0.49341986] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 654/1394 | B: 549/1499 | C: 270/1778 [LOSS Ex1] A: 0.65811 | B: 0.65322 | C: 0.65337 [LOGITS Ex2 A] Mean Abs: 1.778 | Max: 5.847 [LOSS Ex2] A: 0.16819 | B: 0.37460 | C: 0.29584 ** [JOINT LOSS] ** : 0.934444 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002571 | Grad Max: 0.120144 -> Layer: shared_layers.0.bias | Grad Mean: 0.285165 | Grad Max: 1.381527 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.007420 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009273 | Grad Max: 0.009273 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001812 | Grad Max: 0.179918 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.033351 | Grad Max: 1.017690 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.012483 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018033 | Grad Max: 0.096063 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000513 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003891 | Grad Max: 0.008151 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001148 | Grad Max: 0.002598 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001390 | Grad Max: 0.003307 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027690 | Grad Max: 0.027690 [GRADIENT NORM TOTAL] 5.6790 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.541 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095515 0.49044845] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 645/1403 | B: 506/1350 | C: 291/1757 [LOSS Ex1] A: 0.65614 | B: 0.65669 | C: 0.65118 [LOGITS Ex2 A] Mean Abs: 1.727 | Max: 5.924 [LOSS Ex2] A: 0.17421 | B: 0.37842 | C: 0.30911 ** [JOINT LOSS] ** : 0.941919 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004113 | Grad Max: 0.101901 -> Layer: shared_layers.0.bias | Grad Mean: 0.268569 | Grad Max: 1.062521 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.007595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007603 | Grad Max: 0.007603 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001639 | Grad Max: 0.306845 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030542 | Grad Max: 1.729981 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.008323 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013412 | Grad Max: 0.061444 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002960 | Grad Max: 0.006557 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000015 | Grad Max: 0.000212 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000855 | Grad Max: 0.002264 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.002376 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020204 | Grad Max: 0.020204 [GRADIENT NORM TOTAL] 5.6833 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.558 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019347 0.49806532] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 640/1408 | B: 535/1513 | C: 257/1791 [LOSS Ex1] A: 0.65513 | B: 0.65593 | C: 0.65289 [LOGITS Ex2 A] Mean Abs: 1.702 | Max: 6.407 [LOSS Ex2] A: 0.19064 | B: 0.41444 | C: 0.29686 ** [JOINT LOSS] ** : 0.955299 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007640 | Grad Max: 0.225978 -> Layer: shared_layers.0.bias | Grad Mean: 0.537965 | Grad Max: 2.274895 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.007554 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004831 | Grad Max: 0.004831 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003299 | Grad Max: 0.462809 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061941 | Grad Max: 2.497517 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.019090 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029345 | Grad Max: 0.141659 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000861 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006452 | Grad Max: 0.012990 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000404 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001870 | Grad Max: 0.004502 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002232 | Grad Max: 0.004017 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043191 | Grad Max: 0.043191 [GRADIENT NORM TOTAL] 10.6399 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.466 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041577 0.49584228] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 623/1425 | B: 554/1494 | C: 280/1768 [LOSS Ex1] A: 0.66137 | B: 0.65664 | C: 0.65091 [LOGITS Ex2 A] Mean Abs: 1.686 | Max: 5.590 [LOSS Ex2] A: 0.16817 | B: 0.39703 | C: 0.28469 ** [JOINT LOSS] ** : 0.939600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007198 | Grad Max: 0.184984 -> Layer: shared_layers.0.bias | Grad Mean: 0.439171 | Grad Max: 1.656231 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002002 | Grad Max: 0.006484 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007226 | Grad Max: 0.007226 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002783 | Grad Max: 0.392639 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052201 | Grad Max: 2.130070 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000510 | Grad Max: 0.015474 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025881 | Grad Max: 0.113705 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000786 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005693 | Grad Max: 0.013260 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000352 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001638 | Grad Max: 0.004158 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001968 | Grad Max: 0.003854 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037296 | Grad Max: 0.037296 [GRADIENT NORM TOTAL] 8.5420 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.395 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5339173 0.46608266] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.045 [MASKS] A(Pass/Fail): 611/1437 | B: 549/1499 | C: 288/1760 [LOSS Ex1] A: 0.66113 | B: 0.65310 | C: 0.65098 [LOGITS Ex2 A] Mean Abs: 1.712 | Max: 5.631 [LOSS Ex2] A: 0.17902 | B: 0.37392 | C: 
0.30741 ** [JOINT LOSS] ** : 0.941851 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002365 | Grad Max: 0.052858 -> Layer: shared_layers.0.bias | Grad Mean: 0.148174 | Grad Max: 0.678899 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006992 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009358 | Grad Max: 0.009358 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.127595 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019085 | Grad Max: 0.712854 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.006747 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008843 | Grad Max: 0.040413 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000295 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001918 | Grad Max: 0.004924 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000545 | Grad Max: 0.001358 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000591 | Grad Max: 0.001754 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011947 | Grad Max: 0.011947 [GRADIENT NORM TOTAL] 3.1570 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.529 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6596345 0.34036553] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 681/1367 | B: 506/1350 | C: 288/1760 [LOSS Ex1] A: 0.65686 | B: 0.65658 | C: 0.64967 [LOGITS Ex2 A] Mean Abs: 1.768 | Max: 5.645 [LOSS Ex2] A: 0.16717 | B: 0.36816 | C: 0.30261 ** [JOINT LOSS] ** : 0.933681 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004780 | Grad Max: 0.119598 -> Layer: shared_layers.0.bias | Grad Mean: 0.276967 | Grad Max: 1.196343 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006994 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004872 | Grad Max: 0.004872 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.001902 | Grad Max: 0.193620 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035188 | Grad Max: 1.082307 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000323 | Grad Max: 0.011038 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016437 | Grad Max: 0.081377 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000547 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003665 | Grad Max: 0.007795 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001071 | Grad Max: 0.002599 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001337 | Grad Max: 0.003164 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025583 | Grad Max: 0.025583 [GRADIENT NORM TOTAL] 5.6112 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.594 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004416 0.4995584] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 656/1392 | B: 535/1513 | C: 295/1753 [LOSS Ex1] A: 0.66181 | B: 0.65581 | C: 0.65053 [LOGITS Ex2 A] Mean Abs: 1.746 | Max: 5.401 [LOSS Ex2] A: 0.15723 | B: 0.39486 | C: 0.28746 ** [JOINT LOSS] ** : 0.935899 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001664 | Grad Max: 0.032129 -> Layer: shared_layers.0.bias | Grad Mean: 0.025284 | Grad Max: 0.117491 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.006042 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001326 | Grad Max: 0.001326 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000359 | Grad Max: 0.060886 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005808 | Grad Max: 0.323983 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002608 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001487 | Grad Max: 0.011405 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142 -> Layer: exit2_layers.6.bias | Grad Mean: 
0.000282 | Grad Max: 0.001852 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000395 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000426 | Grad Max: 0.000968 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000678 | Grad Max: 0.000678 [GRADIENT NORM TOTAL] 0.9092 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.385 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63071036 0.36928967] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 638/1410 | B: 556/1492 | C: 279/1769 [LOSS Ex1] A: 0.65892 | B: 0.65651 | C: 0.65055 [LOGITS Ex2 A] Mean Abs: 1.733 | Max: 5.937 [LOSS Ex2] A: 0.17962 | B: 0.38824 | C: 0.29462 ** [JOINT LOSS] ** : 0.942819 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003933 | Grad Max: 0.092307 -> Layer: shared_layers.0.bias | Grad Mean: 0.280926 | Grad Max: 1.261168 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.006211 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000546 | Grad Max: 0.000546 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001786 | Grad Max: 0.125419 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033547 | Grad Max: 0.706983 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010552 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017070 | Grad Max: 0.081388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000489 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003728 | Grad Max: 0.007781 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001071 | Grad Max: 0.002605 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.002569 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024460 | Grad Max: 0.024460 [GRADIENT NORM TOTAL] 5.2986 >>> [TRAIN] BATCH 8 START 
<<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.461 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58279115 0.41720888] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 526/1090 | B: 551/1497 | C: 269/1779 [LOSS Ex1] A: 0.65759 | B: 0.65296 | C: 0.65332 [LOGITS Ex2 A] Mean Abs: 1.767 | Max: 5.667 [LOSS Ex2] A: 0.17001 | B: 0.37627 | C: 0.29533 ** [JOINT LOSS] ** : 0.935160 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002647 | Grad Max: 0.085992 -> Layer: shared_layers.0.bias | Grad Mean: 0.231150 | Grad Max: 1.094567 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005953 | Grad Max: 0.005953 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001557 | Grad Max: 0.211276 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029016 | Grad Max: 1.189796 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.010034 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013965 | Grad Max: 0.068089 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000435 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003024 | Grad Max: 0.006864 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000878 | Grad Max: 0.002094 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001108 | Grad Max: 0.002507 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020441 | Grad Max: 0.020441 [GRADIENT NORM TOTAL] 4.8683 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.595 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066391 0.49336094] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 656/1392 | B: 506/1350 | C: 249/1799 [LOSS Ex1] A: 0.65789 | B: 0.65643 | C: 0.65249 
[LOGITS Ex2 A] Mean Abs: 1.802 | Max: 5.996 [LOSS Ex2] A: 0.17782 | B: 0.36830 | C: 0.31688 ** [JOINT LOSS] ** : 0.943269 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004236 | Grad Max: 0.096834 -> Layer: shared_layers.0.bias | Grad Mean: 0.247909 | Grad Max: 1.074477 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006610 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002928 | Grad Max: 0.002928 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001594 | Grad Max: 0.208197 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029353 | Grad Max: 1.144679 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000269 | Grad Max: 0.008641 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013693 | Grad Max: 0.068626 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000442 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003028 | Grad Max: 0.006962 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000876 | Grad Max: 0.002308 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001017 | Grad Max: 0.002309 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019686 | Grad Max: 0.019686 [GRADIENT NORM TOTAL] 5.0265 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.545 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096312 0.49036878] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 649/1399 | B: 535/1513 | C: 287/1761 [LOSS Ex1] A: 0.65589 | B: 0.65566 | C: 0.64894 [LOGITS Ex2 A] Mean Abs: 1.797 | Max: 5.219 [LOSS Ex2] A: 0.17316 | B: 0.39851 | C: 0.30095 ** [JOINT LOSS] ** : 0.944370 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006472 | Grad Max: 0.192561 -> Layer: shared_layers.0.bias | Grad Mean: 0.352406 | Grad Max: 1.554785 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.007657 -> Layer: 
exit1_layers.0.bias | Grad Mean: 0.006991 | Grad Max: 0.006991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.248939 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041080 | Grad Max: 1.235720 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.012793 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020303 | Grad Max: 0.090065 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000594 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004490 | Grad Max: 0.009589 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001289 | Grad Max: 0.003065 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001512 | Grad Max: 0.003144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029106 | Grad Max: 0.029106 [GRADIENT NORM TOTAL] 6.7047 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.561 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501988 0.49801198] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 641/1407 | B: 556/1492 | C: 256/1792 [LOSS Ex1] A: 0.65488 | B: 0.65637 | C: 0.65228 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 7.485 [LOSS Ex2] A: 0.18218 | B: 0.38715 | C: 0.29815 ** [JOINT LOSS] ** : 0.943668 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002697 | Grad Max: 0.089653 -> Layer: shared_layers.0.bias | Grad Mean: 0.093267 | Grad Max: 0.395723 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.007326 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002241 | Grad Max: 0.002241 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000723 | Grad Max: 0.179864 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011973 | Grad Max: 1.005135 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003873 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003273 | Grad Max: 0.019951 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000225 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000785 | Grad Max: 0.002920 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000232 | Grad Max: 0.000758 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000341 | Grad Max: 0.001295 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005656 | Grad Max: 0.005656 [GRADIENT NORM TOTAL] 2.3150 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.469 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50414985 0.49585018] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 624/1424 | B: 551/1497 | C: 277/1771 [LOSS Ex1] A: 0.66113 | B: 0.65280 | C: 0.65175 [LOGITS Ex2 A] Mean Abs: 1.715 | Max: 6.010 [LOSS Ex2] A: 0.17212 | B: 0.38522 | C: 0.28686 ** [JOINT LOSS] ** : 0.936629 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006032 | Grad Max: 0.150363 -> Layer: shared_layers.0.bias | Grad Mean: 0.317297 | Grad Max: 1.281781 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006162 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002144 | Grad Max: 0.002144 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.324702 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042218 | Grad Max: 1.844111 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.012556 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020696 | Grad Max: 0.089555 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000641 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004581 | Grad Max: 0.010216 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000304 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001306 | Grad Max: 0.003227 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001618 | Grad Max: 0.003191 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.029267 | Grad Max: 0.029267 [GRADIENT NORM TOTAL] 6.6134 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.085 | Max: 0.399 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5341344 0.4658656] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.537 | Std: 0.045 [MASKS] A(Pass/Fail): 612/1436 | B: 507/1349 | C: 187/1189 [LOSS Ex1] A: 0.66088 | B: 0.65628 | C: 0.64872 [LOGITS Ex2 A] Mean Abs: 1.689 | Max: 6.012 [LOSS Ex2] A: 0.18180 | B: 0.37767 | C: 0.30487 ** [JOINT LOSS] ** : 0.943408 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007807 | Grad Max: 0.216133 -> Layer: shared_layers.0.bias | Grad Mean: 0.369193 | Grad Max: 1.449482 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006344 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001806 | Grad Max: 0.001806 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.351922 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046517 | Grad Max: 1.991943 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000444 | Grad Max: 0.012351 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022345 | Grad Max: 0.094476 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000663 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004991 | Grad Max: 0.010198 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000332 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.003525 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001803 | Grad Max: 0.003239 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033635 | Grad Max: 0.033635 [GRADIENT NORM TOTAL] 7.2816 [EPOCH SUMMARY] Train Loss: 0.9409 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9196 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9219 -> New: 0.9196) ############################## EPOCH 83/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.533 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66108716 0.33891287] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.047 [MASKS] A(Pass/Fail): 681/1367 | B: 536/1512 | C: 270/1778 [LOSS Ex1] A: 0.65658 | B: 0.65550 | C: 0.65271 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 6.039 [LOSS Ex2] A: 0.16655 | B: 0.38797 | C: 0.29413 ** [JOINT LOSS] ** : 0.937814 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002622 | Grad Max: 0.081762 -> Layer: shared_layers.0.bias | Grad Mean: 0.053166 | Grad Max: 0.292935 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006792 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004396 | Grad Max: 0.004396 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000500 | Grad Max: 0.085670 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007964 | Grad Max: 0.483320 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002133 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001921 | Grad Max: 0.014551 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000154 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000419 | Grad Max: 0.001790 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000124 | Grad Max: 0.000544 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000372 | Grad Max: 0.001223 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002892 | Grad Max: 0.002892 [GRADIENT NORM TOTAL] 1.3592 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.598 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50044656 0.4995535 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 657/1391 | B: 558/1490 | C: 278/1770 [LOSS Ex1] A: 0.66155 | B: 0.65620 | C: 0.64927 [LOGITS Ex2 A] Mean Abs: 1.810 | Max: 5.947 [LOSS Ex2] A: 0.16631 | B: 0.39600 | C: 0.27261 ** [JOINT LOSS] ** : 0.933976 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004741 | Grad Max: 0.161372 -> Layer: shared_layers.0.bias | Grad Mean: 0.435705 | Grad Max: 2.076736 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.005897 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000312 | Grad Max: 0.000312 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002753 | Grad Max: 0.294160 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051950 | Grad Max: 1.685707 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.016290 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024778 | Grad Max: 0.126564 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000671 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005387 | Grad Max: 0.011297 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000307 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001556 | Grad Max: 0.003509 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001857 | Grad Max: 0.004021 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036150 | Grad Max: 0.036150 [GRADIENT NORM TOTAL] 8.8482 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.389 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6318114 0.36818856] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 641/1407 | B: 552/1496 | C: 297/1751 [LOSS Ex1] A: 0.65864 | B: 0.65262 | C: 0.64838 [LOGITS Ex2 A] Mean Abs: 1.812 | Max: 6.115 [LOSS Ex2] A: 0.18800 | B: 0.38922 | C: 0.31829 ** [JOINT LOSS] ** : 0.951715 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007302 | Grad Max: 0.175053 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.505502 | Grad Max: 2.213763 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.006913 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003582 | Grad Max: 0.003582 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003352 | Grad Max: 0.335153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062429 | Grad Max: 1.872115 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000581 | Grad Max: 0.019302 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029777 | Grad Max: 0.146140 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000802 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006553 | Grad Max: 0.013387 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000386 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001905 | Grad Max: 0.004346 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002391 | Grad Max: 0.004457 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044892 | Grad Max: 0.044892 [GRADIENT NORM TOTAL] 10.0410 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.465 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5834521 0.4165479] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 527/1089 | B: 508/1348 | C: 271/1777 [LOSS Ex1] A: 0.65729 | B: 0.65610 | C: 0.65167 [LOGITS Ex2 A] Mean Abs: 1.827 | Max: 5.801 [LOSS Ex2] A: 0.16483 | B: 0.37239 | C: 0.31568 ** [JOINT LOSS] ** : 0.939321 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004422 | Grad Max: 0.135030 -> Layer: shared_layers.0.bias | Grad Mean: 0.184235 | Grad Max: 0.736672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.007038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003751 | Grad Max: 0.003751 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001372 | Grad Max: 0.154648 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025140 | Grad Max: 0.863260 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 
| Grad Max: 0.006666 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011095 | Grad Max: 0.050437 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002471 | Grad Max: 0.005666 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.001731 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000930 | Grad Max: 0.002316 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016741 | Grad Max: 0.016741 [GRADIENT NORM TOTAL] 3.8056 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.600 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066451 0.49335495] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 656/1392 | B: 537/1511 | C: 263/1785 [LOSS Ex1] A: 0.65761 | B: 0.65532 | C: 0.65100 [LOGITS Ex2 A] Mean Abs: 1.780 | Max: 7.449 [LOSS Ex2] A: 0.16512 | B: 0.40602 | C: 0.29181 ** [JOINT LOSS] ** : 0.942293 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005520 | Grad Max: 0.158887 -> Layer: shared_layers.0.bias | Grad Mean: 0.472836 | Grad Max: 2.148050 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.006694 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001117 | Grad Max: 0.001117 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002781 | Grad Max: 0.366372 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052226 | Grad Max: 2.063204 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000516 | Grad Max: 0.017848 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026552 | Grad Max: 0.140814 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000780 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005722 | Grad Max: 0.012057 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000374 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001615 | Grad Max: 
0.004076 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003487 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036620 | Grad Max: 0.036620 [GRADIENT NORM TOTAL] 9.0908 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.550 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096977 0.49030238] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 654/1394 | B: 560/1488 | C: 280/1768 [LOSS Ex1] A: 0.65558 | B: 0.65603 | C: 0.65204 [LOGITS Ex2 A] Mean Abs: 1.754 | Max: 6.032 [LOSS Ex2] A: 0.16430 | B: 0.42602 | C: 0.29373 ** [JOINT LOSS] ** : 0.949233 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006684 | Grad Max: 0.202576 -> Layer: shared_layers.0.bias | Grad Mean: 0.632596 | Grad Max: 2.748610 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.007429 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006166 | Grad Max: 0.006166 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003863 | Grad Max: 0.404063 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073093 | Grad Max: 2.279216 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000721 | Grad Max: 0.023138 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037285 | Grad Max: 0.172367 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001040 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008078 | Grad Max: 0.017286 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000471 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002309 | Grad Max: 0.005287 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002725 | Grad Max: 0.005312 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052785 | Grad Max: 0.052785 [GRADIENT NORM TOTAL] 12.2945 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.567 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5020137 0.49798632] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 644/1404 | B: 553/1495 | C: 276/1772 [LOSS Ex1] A: 0.65458 | B: 0.65245 | C: 0.64994 [LOGITS Ex2 A] Mean Abs: 1.747 | Max: 6.690 [LOSS Ex2] A: 0.18520 | B: 0.39927 | C: 0.30207 ** [JOINT LOSS] ** : 0.947838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005345 | Grad Max: 0.164002 -> Layer: shared_layers.0.bias | Grad Mean: 0.467742 | Grad Max: 2.181714 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.006927 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001186 | Grad Max: 0.001186 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002877 | Grad Max: 0.374071 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053277 | Grad Max: 2.100256 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000513 | Grad Max: 0.017129 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026373 | Grad Max: 0.138311 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005628 | Grad Max: 0.012299 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001605 | Grad Max: 0.003697 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001942 | Grad Max: 0.003318 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037344 | Grad Max: 0.037344 [GRADIENT NORM TOTAL] 9.2168 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.474 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50411963 0.4958804 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 624/1424 | B: 508/1348 | C: 298/1750 [LOSS Ex1] A: 0.66088 | B: 0.65595 | C: 0.64852 [LOGITS Ex2 A] Mean Abs: 1.769 | Max: 5.597 [LOSS Ex2] A: 0.16285 | B: 0.37288 | C: 0.29973 ** [JOINT LOSS] ** : 0.933600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.001699 | Grad Max: 0.053940 -> Layer: shared_layers.0.bias | Grad Mean: 0.059197 | Grad Max: 0.272332 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.006060 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004412 | Grad Max: 0.004412 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000534 | Grad Max: 0.086376 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009237 | Grad Max: 0.485125 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002439 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001394 | Grad Max: 0.012988 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000144 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000235 | Grad Max: 0.001521 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000074 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000070 | Grad Max: 0.000521 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000376 | Grad Max: 0.001041 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000413 | Grad Max: 0.000413 [GRADIENT NORM TOTAL] 1.6288 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.086 | Max: 0.403 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5343707 0.46562928] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.045 [MASKS] A(Pass/Fail): 614/1434 | B: 538/1510 | C: 282/1766 [LOSS Ex1] A: 0.66063 | B: 0.65516 | C: 0.65107 [LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.379 [LOSS Ex2] A: 0.18569 | B: 0.40410 | C: 0.32434 ** [JOINT LOSS] ** : 0.960326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007567 | Grad Max: 0.172572 -> Layer: shared_layers.0.bias | Grad Mean: 0.433503 | Grad Max: 1.714309 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.006349 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003115 | Grad Max: 0.003115 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002795 | Grad Max: 0.278566 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.051780 | Grad Max: 1.471511 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000512 | Grad Max: 0.016698 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026047 | Grad Max: 0.128368 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000797 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005724 | Grad Max: 0.012545 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000366 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001639 | Grad Max: 0.004058 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002004 | Grad Max: 0.003611 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037072 | Grad Max: 0.037072 [GRADIENT NORM TOTAL] 8.1119 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.537 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66263825 0.33736175] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 682/1366 | B: 560/1488 | C: 267/1781 [LOSS Ex1] A: 0.65630 | B: 0.65587 | C: 0.65207 [LOGITS Ex2 A] Mean Abs: 1.834 | Max: 5.959 [LOSS Ex2] A: 0.16961 | B: 0.40310 | C: 0.29367 ** [JOINT LOSS] ** : 0.943537 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006316 | Grad Max: 0.162133 -> Layer: shared_layers.0.bias | Grad Mean: 0.456611 | Grad Max: 1.830347 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.007174 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008691 | Grad Max: 0.008691 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002874 | Grad Max: 0.295177 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053149 | Grad Max: 1.647207 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000542 | Grad Max: 0.017463 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027833 | Grad Max: 0.142669 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000718 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006133 | Grad Max: 0.012556 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | 
Grad Max: 0.000357 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001776 | Grad Max: 0.003922 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002146 | Grad Max: 0.004234 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041032 | Grad Max: 0.041032 [GRADIENT NORM TOTAL] 8.7434 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.603 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50048244 0.49951756] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 658/1390 | B: 554/1494 | C: 297/1751 [LOSS Ex1] A: 0.66130 | B: 0.65229 | C: 0.64905 [LOGITS Ex2 A] Mean Abs: 1.806 | Max: 5.686 [LOSS Ex2] A: 0.16455 | B: 0.37203 | C: 0.27610 ** [JOINT LOSS] ** : 0.925104 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003013 | Grad Max: 0.092531 -> Layer: shared_layers.0.bias | Grad Mean: 0.140499 | Grad Max: 0.520390 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.006805 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008023 | Grad Max: 0.008023 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001061 | Grad Max: 0.228574 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018470 | Grad Max: 1.275674 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.005831 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006664 | Grad Max: 0.045544 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000287 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001339 | Grad Max: 0.003784 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000381 | Grad Max: 0.001052 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000513 | Grad Max: 0.001713 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008780 | Grad Max: 0.008780 [GRADIENT NORM TOTAL] 3.3142 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.393 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6329708 0.36702922] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 642/1406 | B: 508/1348 | C: 279/1769 [LOSS Ex1] A: 0.65837 | B: 0.65579 | C: 0.64994 [LOGITS Ex2 A] Mean Abs: 1.752 | Max: 6.140 [LOSS Ex2] A: 0.18807 | B: 0.38013 | C: 0.30552 ** [JOINT LOSS] ** : 0.945941 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008201 | Grad Max: 0.240744 -> Layer: shared_layers.0.bias | Grad Mean: 0.462725 | Grad Max: 2.047340 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.007180 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004799 | Grad Max: 0.004799 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003026 | Grad Max: 0.382904 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056075 | Grad Max: 2.142964 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.015534 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027123 | Grad Max: 0.117988 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006033 | Grad Max: 0.012423 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000353 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001735 | Grad Max: 0.004007 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002139 | Grad Max: 0.003591 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039762 | Grad Max: 0.039762 [GRADIENT NORM TOTAL] 9.1240 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.469 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58422786 0.4157721 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 530/1086 | B: 538/1510 | C: 247/1801 [LOSS Ex1] A: 0.65702 | B: 0.65500 | C: 0.65391 [LOGITS Ex2 A] Mean Abs: 1.795 | Max: 6.171 [LOSS Ex2] A: 0.17504 | B: 0.42150 | C: 
0.31347 ** [JOINT LOSS] ** : 0.958646 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010125 | Grad Max: 0.284202 -> Layer: shared_layers.0.bias | Grad Mean: 0.593720 | Grad Max: 2.488721 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.006767 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002535 | Grad Max: 0.002535 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003883 | Grad Max: 0.505117 -> Layer: exit2_layers.0.bias | Grad Mean: 0.071966 | Grad Max: 2.800072 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000685 | Grad Max: 0.019504 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035050 | Grad Max: 0.164058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000996 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007775 | Grad Max: 0.016405 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000453 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002250 | Grad Max: 0.004982 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002864 | Grad Max: 0.005658 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052888 | Grad Max: 0.052888 [GRADIENT NORM TOTAL] 11.5784 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.604 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50669533 0.4933046 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 661/1387 | B: 560/1488 | C: 180/1196 [LOSS Ex1] A: 0.65735 | B: 0.65572 | C: 0.65063 [LOGITS Ex2 A] Mean Abs: 1.781 | Max: 5.874 [LOSS Ex2] A: 0.16606 | B: 0.39763 | C: 0.29773 ** [JOINT LOSS] ** : 0.941706 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006566 | Grad Max: 0.174344 -> Layer: shared_layers.0.bias | Grad Mean: 0.364602 | Grad Max: 1.489672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.006289 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000901 | Grad Max: 0.000901 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.002485 | Grad Max: 0.277678 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046136 | Grad Max: 1.515424 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.016310 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022577 | Grad Max: 0.122459 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000657 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004988 | Grad Max: 0.010293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000281 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001447 | Grad Max: 0.003284 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001883 | Grad Max: 0.003451 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034556 | Grad Max: 0.034556 [GRADIENT NORM TOTAL] 7.1693 [EPOCH SUMMARY] Train Loss: 0.9436 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9203 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 84/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.555 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5097768 0.49022323] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 657/1391 | B: 555/1493 | C: 301/1747 [LOSS Ex1] A: 0.65531 | B: 0.65214 | C: 0.64687 [LOGITS Ex2 A] Mean Abs: 1.818 | Max: 6.568 [LOSS Ex2] A: 0.17049 | B: 0.36551 | C: 0.30451 ** [JOINT LOSS] ** : 0.931606 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002891 | Grad Max: 0.080780 -> Layer: shared_layers.0.bias | Grad Mean: 0.227049 | Grad Max: 1.069606 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.007033 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000328 | Grad Max: 0.000328 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.169587 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026548 | Grad 
Max: 0.952679 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.010801 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012478 | Grad Max: 0.073207 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000418 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002680 | Grad Max: 0.006001 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000760 | Grad Max: 0.002076 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000832 | Grad Max: 0.002161 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016654 | Grad Max: 0.016654 [GRADIENT NORM TOTAL] 4.5592 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.572 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020575 0.4979425] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 648/1400 | B: 509/1347 | C: 280/1768 [LOSS Ex1] A: 0.65431 | B: 0.65565 | C: 0.65088 [LOGITS Ex2 A] Mean Abs: 1.825 | Max: 7.315 [LOSS Ex2] A: 0.19226 | B: 0.36604 | C: 0.30118 ** [JOINT LOSS] ** : 0.940105 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005924 | Grad Max: 0.139214 -> Layer: shared_layers.0.bias | Grad Mean: 0.377178 | Grad Max: 1.744680 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.007051 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003563 | Grad Max: 0.003563 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002416 | Grad Max: 0.233308 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044826 | Grad Max: 1.289653 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.014844 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022577 | Grad Max: 0.114792 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000677 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004981 | Grad Max: 0.010689 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000312 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.003406 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001782 | Grad Max: 0.003363 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033313 | Grad Max: 0.033313 [GRADIENT NORM TOTAL] 7.3642 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.477 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041263 0.4958737] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.538 | Std: 0.047 [MASKS] A(Pass/Fail): 626/1422 | B: 539/1509 | C: 264/1784 [LOSS Ex1] A: 0.66064 | B: 0.65487 | C: 0.65191 [LOGITS Ex2 A] Mean Abs: 1.804 | Max: 5.959 [LOSS Ex2] A: 0.16755 | B: 0.38813 | C: 0.28832 ** [JOINT LOSS] ** : 0.937141 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003756 | Grad Max: 0.129616 -> Layer: shared_layers.0.bias | Grad Mean: 0.225316 | Grad Max: 1.091992 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002032 | Grad Max: 0.005748 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000610 | Grad Max: 0.000610 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001408 | Grad Max: 0.148753 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026269 | Grad Max: 0.820112 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.009459 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013144 | Grad Max: 0.074282 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000404 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002865 | Grad Max: 0.005981 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000187 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000823 | Grad Max: 0.001942 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000964 | Grad Max: 0.002524 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018813 | Grad Max: 0.018813 [GRADIENT NORM TOTAL] 4.4240 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.087 | Max: 0.406 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53463125 0.46536872] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 615/1433 | B: 561/1487 | C: 273/1775 [LOSS Ex1] A: 0.66040 | B: 0.65559 | C: 0.65071 [LOGITS Ex2 A] Mean Abs: 1.717 | Max: 6.146 [LOSS Ex2] A: 0.17972 | B: 0.39961 | C: 0.29802 ** [JOINT LOSS] ** : 0.948018 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003424 | Grad Max: 0.096112 -> Layer: shared_layers.0.bias | Grad Mean: 0.301752 | Grad Max: 1.244625 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.006328 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005556 | Grad Max: 0.005556 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.253342 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036009 | Grad Max: 1.400724 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000347 | Grad Max: 0.012915 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018014 | Grad Max: 0.094839 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000510 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003915 | Grad Max: 0.008256 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001135 | Grad Max: 0.002643 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001449 | Grad Max: 0.002622 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027296 | Grad Max: 0.027296 [GRADIENT NORM TOTAL] 5.8941 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.541 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66400915 0.3359909 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 682/1366 | B: 556/1492 | C: 270/1778 [LOSS Ex1] A: 0.65604 | B: 0.65200 | C: 0.64929 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 6.546 [LOSS Ex2] A: 0.16034 | B: 0.38515 | C: 0.28941 ** [JOINT LOSS] ** : 
0.930745 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005143 | Grad Max: 0.113626 -> Layer: shared_layers.0.bias | Grad Mean: 0.403334 | Grad Max: 1.620932 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.007016 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002944 | Grad Max: 0.002944 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002489 | Grad Max: 0.305103 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047049 | Grad Max: 1.700200 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.013144 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023105 | Grad Max: 0.107401 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000680 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005030 | Grad Max: 0.010346 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001445 | Grad Max: 0.003373 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001761 | Grad Max: 0.003394 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033182 | Grad Max: 0.033182 [GRADIENT NORM TOTAL] 7.6532 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.607 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004343 0.49956572] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.047 [MASKS] A(Pass/Fail): 660/1388 | B: 512/1344 | C: 261/1787 [LOSS Ex1] A: 0.66107 | B: 0.65552 | C: 0.65270 [LOGITS Ex2 A] Mean Abs: 1.798 | Max: 6.785 [LOSS Ex2] A: 0.15181 | B: 0.37341 | C: 0.31324 ** [JOINT LOSS] ** : 0.935917 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.044971 -> Layer: shared_layers.0.bias | Grad Mean: 0.144278 | Grad Max: 0.560731 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.006219 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004490 | Grad Max: 0.004490 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000930 | Grad 
Max: 0.119309 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016817 | Grad Max: 0.671055 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007363 | Grad Max: 0.048967 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000272 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001598 | Grad Max: 0.003865 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000469 | Grad Max: 0.001457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000619 | Grad Max: 0.001630 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011681 | Grad Max: 0.011681 [GRADIENT NORM TOTAL] 2.8517 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.396 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6339124 0.3660876] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 644/1404 | B: 539/1509 | C: 287/1761 [LOSS Ex1] A: 0.65814 | B: 0.65474 | C: 0.64871 [LOGITS Ex2 A] Mean Abs: 1.823 | Max: 6.080 [LOSS Ex2] A: 0.18349 | B: 0.39300 | C: 0.30871 ** [JOINT LOSS] ** : 0.948928 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005557 | Grad Max: 0.145253 -> Layer: shared_layers.0.bias | Grad Mean: 0.407777 | Grad Max: 1.880695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.006887 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002707 | Grad Max: 0.002707 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002662 | Grad Max: 0.274344 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049486 | Grad Max: 1.526621 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.016278 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024263 | Grad Max: 0.134811 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000725 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005301 | Grad Max: 0.010885 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000299 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001531 | Grad Max: 0.003422 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003405 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035526 | Grad Max: 0.035526 [GRADIENT NORM TOTAL] 8.1892 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.472 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5847473 0.4152527] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 530/1086 | B: 563/1485 | C: 290/1758 [LOSS Ex1] A: 0.65678 | B: 0.65546 | C: 0.64920 [LOGITS Ex2 A] Mean Abs: 1.861 | Max: 5.270 [LOSS Ex2] A: 0.16918 | B: 0.40049 | C: 0.31807 ** [JOINT LOSS] ** : 0.949722 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007057 | Grad Max: 0.204095 -> Layer: shared_layers.0.bias | Grad Mean: 0.535982 | Grad Max: 2.553962 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.006737 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004335 | Grad Max: 0.004335 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003452 | Grad Max: 0.336403 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063999 | Grad Max: 1.911792 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000586 | Grad Max: 0.018530 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030297 | Grad Max: 0.142962 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000875 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006690 | Grad Max: 0.014084 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000382 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001920 | Grad Max: 0.004523 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002267 | Grad Max: 0.004120 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043277 | Grad Max: 0.043277 [GRADIENT NORM TOTAL] 10.7333 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.608 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067628 0.4932372] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 661/1387 | B: 557/1491 | C: 273/1775 [LOSS Ex1] A: 0.65712 | B: 0.65186 | C: 0.64960 [LOGITS Ex2 A] Mean Abs: 1.823 | Max: 6.869 [LOSS Ex2] A: 0.16803 | B: 0.37817 | C: 0.29352 ** [JOINT LOSS] ** : 0.932767 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004845 | Grad Max: 0.148016 -> Layer: shared_layers.0.bias | Grad Mean: 0.351495 | Grad Max: 1.663359 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006836 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006575 | Grad Max: 0.006575 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.242251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041686 | Grad Max: 1.363715 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.014889 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020134 | Grad Max: 0.110313 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000545 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004428 | Grad Max: 0.008924 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000277 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001276 | Grad Max: 0.003171 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001501 | Grad Max: 0.003375 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029182 | Grad Max: 0.029182 [GRADIENT NORM TOTAL] 7.0651 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.559 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50981414 0.49018592] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 657/1391 | B: 512/1344 | C: 269/1779 [LOSS Ex1] A: 0.65505 | B: 0.65539 | C: 0.65169 [LOGITS Ex2 A] Mean Abs: 
1.775 | Max: 6.692 [LOSS Ex2] A: 0.16507 | B: 0.37184 | C: 0.29435 ** [JOINT LOSS] ** : 0.931131 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002852 | Grad Max: 0.070220 -> Layer: shared_layers.0.bias | Grad Mean: 0.221271 | Grad Max: 0.975239 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.007294 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006122 | Grad Max: 0.006122 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001415 | Grad Max: 0.189478 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026266 | Grad Max: 1.078609 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.009658 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012176 | Grad Max: 0.071733 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000420 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002668 | Grad Max: 0.006260 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000183 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000756 | Grad Max: 0.001906 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000832 | Grad Max: 0.002035 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016029 | Grad Max: 0.016029 [GRADIENT NORM TOTAL] 4.5358 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.576 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021423 0.49785763] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 649/1399 | B: 541/1507 | C: 261/1787 [LOSS Ex1] A: 0.65405 | B: 0.65461 | C: 0.65186 [LOGITS Ex2 A] Mean Abs: 1.768 | Max: 7.075 [LOSS Ex2] A: 0.18829 | B: 0.40786 | C: 0.29326 ** [JOINT LOSS] ** : 0.949980 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005642 | Grad Max: 0.170055 -> Layer: shared_layers.0.bias | Grad Mean: 0.385804 | Grad Max: 1.797332 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.007480 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.005884 | Grad Max: 0.005884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.286624 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045641 | Grad Max: 1.549579 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000417 | Grad Max: 0.014059 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021606 | Grad Max: 0.111518 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000654 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004697 | Grad Max: 0.009922 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001353 | Grad Max: 0.003091 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001696 | Grad Max: 0.003142 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031357 | Grad Max: 0.031357 [GRADIENT NORM TOTAL] 7.6559 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.480 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041737 0.4958263] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 627/1421 | B: 563/1485 | C: 299/1749 [LOSS Ex1] A: 0.66043 | B: 0.65534 | C: 0.65009 [LOGITS Ex2 A] Mean Abs: 1.750 | Max: 6.190 [LOSS Ex2] A: 0.15665 | B: 0.38902 | C: 0.28469 ** [JOINT LOSS] ** : 0.932071 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003569 | Grad Max: 0.097914 -> Layer: shared_layers.0.bias | Grad Mean: 0.176771 | Grad Max: 0.750533 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005750 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003961 | Grad Max: 0.003961 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001227 | Grad Max: 0.199659 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022758 | Grad Max: 1.122902 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.008156 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011087 | Grad Max: 0.055936 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | 
Grad Max: 0.000342 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002431 | Grad Max: 0.005651 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000703 | Grad Max: 0.001770 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000973 | Grad Max: 0.002399 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016978 | Grad Max: 0.016978 [GRADIENT NORM TOTAL] 3.7905 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.409 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5347846 0.46521538] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 618/1430 | B: 557/1491 | C: 299/1749 [LOSS Ex1] A: 0.66020 | B: 0.65173 | C: 0.64746 [LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.147 [LOSS Ex2] A: 0.17407 | B: 0.36926 | C: 0.28423 ** [JOINT LOSS] ** : 0.928982 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004923 | Grad Max: 0.125996 -> Layer: shared_layers.0.bias | Grad Mean: 0.386704 | Grad Max: 1.571955 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006436 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005506 | Grad Max: 0.005506 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002394 | Grad Max: 0.283996 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044941 | Grad Max: 1.533026 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.016113 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022181 | Grad Max: 0.125093 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000629 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004849 | Grad Max: 0.010594 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000280 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001399 | Grad Max: 0.003269 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001597 | Grad Max: 0.003582 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031488 | Grad Max: 
0.031488 [GRADIENT NORM TOTAL] 7.5523 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.544 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6651136 0.33488637] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.048 [MASKS] A(Pass/Fail): 685/1363 | B: 513/1343 | C: 171/1205 [LOSS Ex1] A: 0.65583 | B: 0.65527 | C: 0.65288 [LOGITS Ex2 A] Mean Abs: 1.804 | Max: 6.240 [LOSS Ex2] A: 0.17176 | B: 0.37314 | C: 0.28313 ** [JOINT LOSS] ** : 0.930669 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007322 | Grad Max: 0.180582 -> Layer: shared_layers.0.bias | Grad Mean: 0.461556 | Grad Max: 1.864977 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.007000 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005150 | Grad Max: 0.005150 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002978 | Grad Max: 0.299937 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055297 | Grad Max: 1.646744 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000529 | Grad Max: 0.017806 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027446 | Grad Max: 0.144206 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000741 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006113 | Grad Max: 0.011975 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001794 | Grad Max: 0.004019 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002169 | Grad Max: 0.004553 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041783 | Grad Max: 0.041783 [GRADIENT NORM TOTAL] 8.9401 [EPOCH SUMMARY] Train Loss: 0.9377 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9196 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9196 -> New: 0.9196) ############################## EPOCH 85/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.611 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50039464 0.49960533] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 662/1386 | B: 542/1506 | C: 296/1752 [LOSS Ex1] A: 0.66088 | B: 0.65449 | C: 0.64767 [LOGITS Ex2 A] Mean Abs: 1.795 | Max: 5.820 [LOSS Ex2] A: 0.16130 | B: 0.39623 | C: 0.30902 ** [JOINT LOSS] ** : 0.943199 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.080279 -> Layer: shared_layers.0.bias | Grad Mean: 0.245247 | Grad Max: 0.993112 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.006140 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001621 | Grad Max: 0.001621 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001541 | Grad Max: 0.147479 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028614 | Grad Max: 0.821046 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008957 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014195 | Grad Max: 0.073036 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000453 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003081 | Grad Max: 0.006822 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000887 | Grad Max: 0.002346 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000983 | Grad Max: 0.002394 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019824 | Grad Max: 0.019824 [GRADIENT NORM TOTAL] 4.7515 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.399 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63472915 0.36527085] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 645/1403 | B: 563/1485 | C: 286/1762 [LOSS Ex1] A: 0.65793 | B: 0.65522 | C: 0.64731 [LOGITS Ex2 A] Mean Abs: 1.756 | Max: 5.848 [LOSS Ex2] A: 0.18105 | B: 0.39248 | C: 0.29020 ** [JOINT LOSS] ** : 0.941397 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005830 | Grad Max: 0.144003 -> Layer: shared_layers.0.bias | Grad Mean: 0.347118 | Grad Max: 1.647611 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.006435 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001272 | Grad Max: 0.001272 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.364225 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042758 | Grad Max: 2.036950 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000390 | Grad Max: 0.013523 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020097 | Grad Max: 0.100775 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000663 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004452 | Grad Max: 0.009699 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000283 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001291 | Grad Max: 0.003352 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001571 | Grad Max: 0.002949 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029610 | Grad Max: 0.029610 [GRADIENT NORM TOTAL] 7.0656 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.474 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5852907 0.41470936] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 532/1084 | B: 557/1491 | C: 253/1795 [LOSS Ex1] A: 0.65657 | B: 0.65161 | C: 0.65571 [LOGITS Ex2 A] Mean Abs: 1.767 | Max: 6.160 [LOSS Ex2] A: 0.17511 | B: 0.39099 | C: 0.29126 ** [JOINT LOSS] ** : 0.940413 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008297 | Grad Max: 0.218964 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.524342 | Grad Max: 2.093285 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.006589 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000919 | Grad Max: 0.000919 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003386 | Grad Max: 0.386591 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063439 | Grad Max: 2.094940 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000605 | Grad Max: 0.019528 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031442 | Grad Max: 0.157913 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000828 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006921 | Grad Max: 0.013632 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000386 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002007 | Grad Max: 0.004402 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002480 | Grad Max: 0.004407 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046473 | Grad Max: 0.046473 [GRADIENT NORM TOTAL] 10.2472 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.612 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50679153 0.49320844] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 662/1386 | B: 513/1343 | C: 293/1755 [LOSS Ex1] A: 0.65692 | B: 0.65515 | C: 0.64734 [LOGITS Ex2 A] Mean Abs: 1.789 | Max: 5.801 [LOSS Ex2] A: 0.16845 | B: 0.37682 | C: 0.30470 ** [JOINT LOSS] ** : 0.936464 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007531 | Grad Max: 0.213606 -> Layer: shared_layers.0.bias | Grad Mean: 0.366047 | Grad Max: 1.431149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.006941 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004224 | Grad Max: 0.004224 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.271496 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043656 | Grad Max: 1.484161 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000415 | Grad Max: 0.013380 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021320 | Grad Max: 0.098325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000618 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004777 | Grad Max: 0.009637 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000317 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001390 | Grad Max: 0.003562 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001703 | Grad Max: 0.002992 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032102 | Grad Max: 0.032102 [GRADIENT NORM TOTAL] 6.9777 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.562 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509851 0.49014902] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 657/1391 | B: 543/1505 | C: 275/1773 [LOSS Ex1] A: 0.65485 | B: 0.65437 | C: 0.64997 [LOGITS Ex2 A] Mean Abs: 1.806 | Max: 6.394 [LOSS Ex2] A: 0.16694 | B: 0.38740 | C: 0.29520 ** [JOINT LOSS] ** : 0.936243 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002743 | Grad Max: 0.062419 -> Layer: shared_layers.0.bias | Grad Mean: 0.161622 | Grad Max: 0.828384 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006384 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003889 | Grad Max: 0.003889 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001047 | Grad Max: 0.155894 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019150 | Grad Max: 0.884593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.006366 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008156 | Grad Max: 0.047500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000274 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001830 | Grad Max: 0.004639 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000134 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000535 | Grad 
Max: 0.001424 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000590 | Grad Max: 0.001813 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012170 | Grad Max: 0.012170 [GRADIENT NORM TOTAL] 3.3117 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.580 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021919 0.49780813] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 651/1397 | B: 563/1485 | C: 310/1738 [LOSS Ex1] A: 0.65385 | B: 0.65510 | C: 0.64614 [LOGITS Ex2 A] Mean Abs: 1.814 | Max: 6.428 [LOSS Ex2] A: 0.18668 | B: 0.39612 | C: 0.29257 ** [JOINT LOSS] ** : 0.943487 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007860 | Grad Max: 0.303933 -> Layer: shared_layers.0.bias | Grad Mean: 0.444824 | Grad Max: 1.837051 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.007282 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000491 | Grad Max: 0.000491 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.297850 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056213 | Grad Max: 1.690377 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000512 | Grad Max: 0.015826 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026307 | Grad Max: 0.124009 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000791 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005919 | Grad Max: 0.012702 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000352 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001719 | Grad Max: 0.004162 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001990 | Grad Max: 0.004022 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038662 | Grad Max: 0.038662 [GRADIENT NORM TOTAL] 9.0111 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.483 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5041973 0.4958027] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 628/1420 | B: 557/1491 | C: 245/1803 [LOSS Ex1] A: 0.66025 | B: 0.65149 | C: 0.65215 [LOGITS Ex2 A] Mean Abs: 1.785 | Max: 6.162 [LOSS Ex2] A: 0.16445 | B: 0.37024 | C: 0.28893 ** [JOINT LOSS] ** : 0.929175 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003738 | Grad Max: 0.117867 -> Layer: shared_layers.0.bias | Grad Mean: 0.187586 | Grad Max: 0.873235 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005648 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002412 | Grad Max: 0.002412 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001247 | Grad Max: 0.158090 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022876 | Grad Max: 0.839845 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.007941 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010651 | Grad Max: 0.056878 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002377 | Grad Max: 0.005094 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000167 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000694 | Grad Max: 0.001659 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000791 | Grad Max: 0.002614 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016025 | Grad Max: 0.016025 [GRADIENT NORM TOTAL] 3.7682 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.087 | Max: 0.412 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5349357 0.46506432] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.538 | Std: 0.046 [MASKS] A(Pass/Fail): 620/1428 | B: 513/1343 | C: 288/1760 [LOSS Ex1] A: 0.66001 | B: 0.65504 | C: 0.65041 [LOGITS Ex2 A] Mean Abs: 1.714 | Max: 6.062 [LOSS Ex2] A: 0.16967 | B: 0.37170 | C: 0.30777 ** [JOINT LOSS] ** : 0.938200 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.003769 | Grad Max: 0.092711 -> Layer: shared_layers.0.bias | Grad Mean: 0.287839 | Grad Max: 1.173489 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.006005 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001436 | Grad Max: 0.001436 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001742 | Grad Max: 0.297394 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032610 | Grad Max: 1.679518 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.010035 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015334 | Grad Max: 0.075053 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000462 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003342 | Grad Max: 0.007835 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000227 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000959 | Grad Max: 0.002641 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001080 | Grad Max: 0.002401 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021341 | Grad Max: 0.021341 [GRADIENT NORM TOTAL] 5.7329 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.547 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66616076 0.3338393 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 686/1362 | B: 543/1505 | C: 267/1781 [LOSS Ex1] A: 0.65562 | B: 0.65426 | C: 0.64955 [LOGITS Ex2 A] Mean Abs: 1.750 | Max: 6.022 [LOSS Ex2] A: 0.16296 | B: 0.40958 | C: 0.27953 ** [JOINT LOSS] ** : 0.937168 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004074 | Grad Max: 0.126571 -> Layer: shared_layers.0.bias | Grad Mean: 0.396913 | Grad Max: 1.748523 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.007099 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006388 | Grad Max: 0.006388 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002445 | Grad Max: 0.338056 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.045923 | Grad Max: 1.887187 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000423 | Grad Max: 0.014132 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022316 | Grad Max: 0.108614 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000655 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004865 | Grad Max: 0.010197 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001401 | Grad Max: 0.003409 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001641 | Grad Max: 0.003311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031713 | Grad Max: 0.031713 [GRADIENT NORM TOTAL] 7.9458 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.614 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003738 0.49962622] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 662/1386 | B: 563/1485 | C: 268/1780 [LOSS Ex1] A: 0.66070 | B: 0.65499 | C: 0.65206 [LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.580 [LOSS Ex2] A: 0.15400 | B: 0.39717 | C: 0.28844 ** [JOINT LOSS] ** : 0.935783 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003422 | Grad Max: 0.100730 -> Layer: shared_layers.0.bias | Grad Mean: 0.170546 | Grad Max: 0.632470 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.005759 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000527 | Grad Max: 0.000527 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001111 | Grad Max: 0.204566 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019932 | Grad Max: 1.148082 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.006125 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008934 | Grad Max: 0.046858 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000334 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002015 | Grad Max: 0.005361 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000145 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000574 | Grad Max: 0.001585 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000649 | Grad Max: 0.001853 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012656 | Grad Max: 0.012656 [GRADIENT NORM TOTAL] 3.4402 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.402 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63549095 0.364509 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 645/1403 | B: 557/1491 | C: 275/1773 [LOSS Ex1] A: 0.65773 | B: 0.65137 | C: 0.65149 [LOGITS Ex2 A] Mean Abs: 1.803 | Max: 5.732 [LOSS Ex2] A: 0.18145 | B: 0.37555 | C: 0.28557 ** [JOINT LOSS] ** : 0.934391 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004363 | Grad Max: 0.125121 -> Layer: shared_layers.0.bias | Grad Mean: 0.361128 | Grad Max: 1.610136 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.006543 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000977 | Grad Max: 0.000977 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002305 | Grad Max: 0.236972 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043149 | Grad Max: 1.350258 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012164 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020566 | Grad Max: 0.095078 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000583 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004498 | Grad Max: 0.009557 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000252 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001297 | Grad Max: 0.002966 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001478 | Grad Max: 0.003200 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029457 | Grad Max: 0.029457 [GRADIENT NORM TOTAL] 7.1588 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 
32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.477 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5857211 0.41427898] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 533/1083 | B: 513/1343 | C: 270/1778 [LOSS Ex1] A: 0.65636 | B: 0.65492 | C: 0.64925 [LOGITS Ex2 A] Mean Abs: 1.862 | Max: 5.635 [LOSS Ex2] A: 0.17588 | B: 0.36996 | C: 0.31226 ** [JOINT LOSS] ** : 0.939545 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005976 | Grad Max: 0.159170 -> Layer: shared_layers.0.bias | Grad Mean: 0.458288 | Grad Max: 2.049618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006865 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001572 | Grad Max: 0.001572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002946 | Grad Max: 0.288431 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055510 | Grad Max: 1.591266 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000502 | Grad Max: 0.017483 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026266 | Grad Max: 0.137491 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000751 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005775 | Grad Max: 0.012288 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000313 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001677 | Grad Max: 0.003649 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001917 | Grad Max: 0.003686 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037762 | Grad Max: 0.037762 [GRADIENT NORM TOTAL] 9.0633 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.615 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50685906 0.49314094] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 664/1384 | B: 544/1504 | C: 279/1769 [LOSS Ex1] A: 0.65672 | B: 0.65414 | C: 0.64943 [LOGITS Ex2 A] Mean Abs: 1.806 | Max: 
6.585 [LOSS Ex2] A: 0.16408 | B: 0.38993 | C: 0.29863 ** [JOINT LOSS] ** : 0.937640 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.085773 -> Layer: shared_layers.0.bias | Grad Mean: 0.254093 | Grad Max: 0.981542 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.007075 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007081 | Grad Max: 0.007081 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001578 | Grad Max: 0.175106 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029332 | Grad Max: 0.980644 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.010197 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014130 | Grad Max: 0.073503 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000397 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003073 | Grad Max: 0.006920 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000902 | Grad Max: 0.002013 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001035 | Grad Max: 0.002564 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020949 | Grad Max: 0.020949 [GRADIENT NORM TOTAL] 4.9337 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.566 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50987995 0.49012005] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 657/1391 | B: 563/1485 | C: 208/1168 [LOSS Ex1] A: 0.65462 | B: 0.65488 | C: 0.64752 [LOGITS Ex2 A] Mean Abs: 1.771 | Max: 5.383 [LOSS Ex2] A: 0.16084 | B: 0.39900 | C: 0.30348 ** [JOINT LOSS] ** : 0.940114 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004468 | Grad Max: 0.116965 -> Layer: shared_layers.0.bias | Grad Mean: 0.341547 | Grad Max: 1.542865 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006800 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001744 | 
Grad Max: 0.001744 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.250037 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041592 | Grad Max: 1.422090 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000390 | Grad Max: 0.012062 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020342 | Grad Max: 0.095090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000623 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004490 | Grad Max: 0.009993 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001303 | Grad Max: 0.003144 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001486 | Grad Max: 0.002875 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029343 | Grad Max: 0.029343 [GRADIENT NORM TOTAL] 6.8264 [EPOCH SUMMARY] Train Loss: 0.9381 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9244 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 86/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.583 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50223905 0.49776092] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 652/1396 | B: 557/1491 | C: 275/1773 [LOSS Ex1] A: 0.65363 | B: 0.65125 | C: 0.64881 [LOGITS Ex2 A] Mean Abs: 1.737 | Max: 6.561 [LOSS Ex2] A: 0.18662 | B: 0.39080 | C: 0.29311 ** [JOINT LOSS] ** : 0.941409 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005105 | Grad Max: 0.147720 -> Layer: shared_layers.0.bias | Grad Mean: 0.418377 | Grad Max: 2.047090 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002346 | Grad Max: 0.007466 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003013 | Grad Max: 0.003013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002631 | Grad Max: 0.301526 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.049612 | Grad Max: 1.678511 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.014436 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023526 | Grad Max: 0.114790 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000691 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005085 | Grad Max: 0.011396 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000319 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001452 | Grad Max: 0.003571 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001691 | Grad Max: 0.003123 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032441 | Grad Max: 0.032441 [GRADIENT NORM TOTAL] 8.3143 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.486 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50420094 0.4957991 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.048 [MASKS] A(Pass/Fail): 630/1418 | B: 514/1342 | C: 262/1786 [LOSS Ex1] A: 0.66006 | B: 0.65481 | C: 0.65186 [LOGITS Ex2 A] Mean Abs: 1.727 | Max: 6.020 [LOSS Ex2] A: 0.16692 | B: 0.36347 | C: 0.29925 ** [JOINT LOSS] ** : 0.932120 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003607 | Grad Max: 0.076069 -> Layer: shared_layers.0.bias | Grad Mean: 0.237405 | Grad Max: 1.108621 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002031 | Grad Max: 0.005617 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003129 | Grad Max: 0.003129 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001657 | Grad Max: 0.238215 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030976 | Grad Max: 1.332285 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.009575 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015169 | Grad Max: 0.067861 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000497 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003345 | Grad Max: 0.007745 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000016 | Grad Max: 0.000252 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000976 | Grad Max: 0.002608 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001191 | Grad Max: 0.002717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022491 | Grad Max: 0.022491 [GRADIENT NORM TOTAL] 5.0342 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.415 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5350715 0.4649285] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.046 [MASKS] A(Pass/Fail): 620/1428 | B: 544/1504 | C: 262/1786 [LOSS Ex1] A: 0.65983 | B: 0.65403 | C: 0.65286 [LOGITS Ex2 A] Mean Abs: 1.752 | Max: 5.918 [LOSS Ex2] A: 0.17617 | B: 0.38583 | C: 0.32062 ** [JOINT LOSS] ** : 0.949779 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004283 | Grad Max: 0.119861 -> Layer: shared_layers.0.bias | Grad Mean: 0.266828 | Grad Max: 1.207068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002003 | Grad Max: 0.006108 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002345 | Grad Max: 0.002345 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001695 | Grad Max: 0.173010 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031268 | Grad Max: 0.891740 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.008211 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015033 | Grad Max: 0.069502 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000472 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003338 | Grad Max: 0.007122 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000214 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000955 | Grad Max: 0.002386 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.002550 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020975 | Grad Max: 0.020975 [GRADIENT NORM TOTAL] 5.1132 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.551 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66726863 0.33273134] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 687/1361 | B: 564/1484 | C: 281/1767 [LOSS Ex1] A: 0.65542 | B: 0.65476 | C: 0.64621 [LOGITS Ex2 A] Mean Abs: 1.807 | Max: 5.940 [LOSS Ex2] A: 0.17239 | B: 0.39300 | C: 0.29692 ** [JOINT LOSS] ** : 0.939569 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006331 | Grad Max: 0.185356 -> Layer: shared_layers.0.bias | Grad Mean: 0.452796 | Grad Max: 2.043562 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002339 | Grad Max: 0.007280 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010387 | Grad Max: 0.010387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002930 | Grad Max: 0.307946 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054354 | Grad Max: 1.728766 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.016308 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026628 | Grad Max: 0.130027 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000730 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005924 | Grad Max: 0.012591 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000317 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001734 | Grad Max: 0.003745 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002022 | Grad Max: 0.004056 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039548 | Grad Max: 0.039548 [GRADIENT NORM TOTAL] 9.0571 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.618 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50037 0.49963] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 663/1385 | B: 557/1491 | C: 281/1767 [LOSS Ex1] A: 0.66052 | B: 0.65114 | C: 0.64864 [LOGITS Ex2 A] Mean Abs: 1.792 | Max: 6.058 [LOSS Ex2] A: 0.15685 | B: 0.36880 | C: 
0.28514 ** [JOINT LOSS] ** : 0.923692 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002810 | Grad Max: 0.089864 -> Layer: shared_layers.0.bias | Grad Mean: 0.245488 | Grad Max: 1.133690 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006190 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001988 | Grad Max: 0.001988 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001562 | Grad Max: 0.180420 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029263 | Grad Max: 0.993164 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000271 | Grad Max: 0.009406 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014202 | Grad Max: 0.073577 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000442 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003081 | Grad Max: 0.007036 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000897 | Grad Max: 0.002088 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001010 | Grad Max: 0.002996 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020573 | Grad Max: 0.020573 [GRADIENT NORM TOTAL] 5.0335 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.405 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6363522 0.3636478] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.049 [MASKS] A(Pass/Fail): 647/1401 | B: 514/1342 | C: 266/1782 [LOSS Ex1] A: 0.65754 | B: 0.65470 | C: 0.65130 [LOGITS Ex2 A] Mean Abs: 1.734 | Max: 5.965 [LOSS Ex2] A: 0.18791 | B: 0.37988 | C: 0.31029 ** [JOINT LOSS] ** : 0.947205 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005183 | Grad Max: 0.130958 -> Layer: shared_layers.0.bias | Grad Mean: 0.310304 | Grad Max: 1.447340 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.006772 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006774 | Grad Max: 0.006774 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.001950 | Grad Max: 0.310156 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036205 | Grad Max: 1.748706 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.010720 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017244 | Grad Max: 0.083989 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000482 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003777 | Grad Max: 0.008170 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000230 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001086 | Grad Max: 0.002631 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001235 | Grad Max: 0.002440 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023751 | Grad Max: 0.023751 [GRADIENT NORM TOTAL] 6.0913 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.480 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5862583 0.4137417] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 534/1082 | B: 545/1503 | C: 290/1758 [LOSS Ex1] A: 0.65615 | B: 0.65392 | C: 0.65045 [LOGITS Ex2 A] Mean Abs: 1.785 | Max: 6.555 [LOSS Ex2] A: 0.17059 | B: 0.41277 | C: 0.29829 ** [JOINT LOSS] ** : 0.947390 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007399 | Grad Max: 0.188933 -> Layer: shared_layers.0.bias | Grad Mean: 0.531803 | Grad Max: 2.349885 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.006762 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006662 | Grad Max: 0.006662 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003376 | Grad Max: 0.454963 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063376 | Grad Max: 2.553191 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.018130 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030820 | Grad Max: 0.144340 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000807 -> Layer: exit2_layers.6.bias | Grad Mean: 
0.006760 | Grad Max: 0.013431 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000398 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001961 | Grad Max: 0.004750 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002327 | Grad Max: 0.004136 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044817 | Grad Max: 0.044817 [GRADIENT NORM TOTAL] 10.5982 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.618 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50685745 0.49314258] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 664/1384 | B: 564/1484 | C: 301/1747 [LOSS Ex1] A: 0.65652 | B: 0.65466 | C: 0.64921 [LOGITS Ex2 A] Mean Abs: 1.780 | Max: 5.893 [LOSS Ex2] A: 0.17294 | B: 0.39546 | C: 0.27514 ** [JOINT LOSS] ** : 0.934643 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004590 | Grad Max: 0.120034 -> Layer: shared_layers.0.bias | Grad Mean: 0.267225 | Grad Max: 1.321267 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006643 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004846 | Grad Max: 0.004846 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.257900 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031486 | Grad Max: 1.448287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000285 | Grad Max: 0.008947 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014979 | Grad Max: 0.068971 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000482 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003292 | Grad Max: 0.008047 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.002373 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001099 | Grad Max: 0.002462 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021079 | Grad Max: 0.021079 [GRADIENT NORM TOTAL] 5.3768 >>> [TRAIN] BATCH 8 START 
<<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.569 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50995743 0.49004254] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 660/1388 | B: 557/1491 | C: 289/1759 [LOSS Ex1] A: 0.65441 | B: 0.65102 | C: 0.64850 [LOGITS Ex2 A] Mean Abs: 1.814 | Max: 6.168 [LOSS Ex2] A: 0.16168 | B: 0.37633 | C: 0.28547 ** [JOINT LOSS] ** : 0.925804 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003805 | Grad Max: 0.108565 -> Layer: shared_layers.0.bias | Grad Mean: 0.214512 | Grad Max: 0.964134 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.006625 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001721 | Grad Max: 0.001721 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.189326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025592 | Grad Max: 1.030145 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.008496 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011962 | Grad Max: 0.060736 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000391 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002647 | Grad Max: 0.006341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000768 | Grad Max: 0.001758 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000890 | Grad Max: 0.002761 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017431 | Grad Max: 0.017431 [GRADIENT NORM TOTAL] 4.3973 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.587 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022315 0.49776852] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 653/1395 | B: 514/1342 | C: 278/1770 [LOSS Ex1] A: 0.65342 | B: 0.65459 | C: 0.64859 
[LOGITS Ex2 A] Mean Abs: 1.797 | Max: 6.628 [LOSS Ex2] A: 0.19436 | B: 0.36088 | C: 0.30498 ** [JOINT LOSS] ** : 0.938940 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006531 | Grad Max: 0.198765 -> Layer: shared_layers.0.bias | Grad Mean: 0.369131 | Grad Max: 1.593338 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.007165 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000329 | Grad Max: 0.000329 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002497 | Grad Max: 0.258218 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045909 | Grad Max: 1.447757 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000409 | Grad Max: 0.012028 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021223 | Grad Max: 0.092679 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000615 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004789 | Grad Max: 0.009741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000303 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001394 | Grad Max: 0.003379 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001651 | Grad Max: 0.003096 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031659 | Grad Max: 0.031659 [GRADIENT NORM TOTAL] 7.4095 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.489 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041656 0.49583438] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.539 | Std: 0.048 [MASKS] A(Pass/Fail): 633/1415 | B: 545/1503 | C: 288/1760 [LOSS Ex1] A: 0.65988 | B: 0.65380 | C: 0.64681 [LOGITS Ex2 A] Mean Abs: 1.781 | Max: 5.850 [LOSS Ex2] A: 0.16469 | B: 0.38890 | C: 0.30645 ** [JOINT LOSS] ** : 0.940177 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003774 | Grad Max: 0.099501 -> Layer: shared_layers.0.bias | Grad Mean: 0.175529 | Grad Max: 0.828740 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.006074 -> Layer: 
exit1_layers.0.bias | Grad Mean: 0.004666 | Grad Max: 0.004666 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001147 | Grad Max: 0.108907 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020803 | Grad Max: 0.598575 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000190 | Grad Max: 0.005523 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009675 | Grad Max: 0.044510 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000357 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002176 | Grad Max: 0.005668 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000147 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001650 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000671 | Grad Max: 0.001771 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013352 | Grad Max: 0.013352 [GRADIENT NORM TOTAL] 3.3475 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.088 | Max: 0.418 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53531635 0.46468365] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 621/1427 | B: 564/1484 | C: 271/1777 [LOSS Ex1] A: 0.65965 | B: 0.65454 | C: 0.64938 [LOGITS Ex2 A] Mean Abs: 1.709 | Max: 7.028 [LOSS Ex2] A: 0.17704 | B: 0.40155 | C: 0.29410 ** [JOINT LOSS] ** : 0.945420 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006334 | Grad Max: 0.148694 -> Layer: shared_layers.0.bias | Grad Mean: 0.439906 | Grad Max: 1.924764 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.006306 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007059 | Grad Max: 0.007059 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002772 | Grad Max: 0.307066 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051944 | Grad Max: 1.721029 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000495 | Grad Max: 0.016312 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025869 | Grad Max: 0.126429 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000718 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005709 | Grad Max: 0.011651 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000347 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001658 | Grad Max: 0.003989 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001971 | Grad Max: 0.003400 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037770 | Grad Max: 0.037770 [GRADIENT NORM TOTAL] 8.5671 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.553 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66844493 0.33155513] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 688/1360 | B: 557/1491 | C: 292/1756 [LOSS Ex1] A: 0.65521 | B: 0.65091 | C: 0.64860 [LOGITS Ex2 A] Mean Abs: 1.749 | Max: 6.225 [LOSS Ex2] A: 0.16226 | B: 0.38335 | C: 0.29748 ** [JOINT LOSS] ** : 0.932604 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006409 | Grad Max: 0.141052 -> Layer: shared_layers.0.bias | Grad Mean: 0.468551 | Grad Max: 1.977912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.006857 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005920 | Grad Max: 0.005920 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002991 | Grad Max: 0.384782 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056039 | Grad Max: 2.176695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000525 | Grad Max: 0.017409 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027559 | Grad Max: 0.138257 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000787 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006064 | Grad Max: 0.012445 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000382 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001750 | Grad Max: 0.004608 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002010 | Grad Max: 0.003763 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.038764 | Grad Max: 0.038764 [GRADIENT NORM TOTAL] 9.2913 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.620 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003841 0.49961594] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.048 [MASKS] A(Pass/Fail): 665/1383 | B: 516/1340 | C: 186/1190 [LOSS Ex1] A: 0.66034 | B: 0.65448 | C: 0.64935 [LOGITS Ex2 A] Mean Abs: 1.779 | Max: 5.459 [LOSS Ex2] A: 0.16420 | B: 0.37163 | C: 0.26742 ** [JOINT LOSS] ** : 0.922477 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005492 | Grad Max: 0.183125 -> Layer: shared_layers.0.bias | Grad Mean: 0.254672 | Grad Max: 1.016137 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006698 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008878 | Grad Max: 0.008878 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001629 | Grad Max: 0.279880 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029007 | Grad Max: 1.548691 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000261 | Grad Max: 0.007124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013412 | Grad Max: 0.050487 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000430 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003081 | Grad Max: 0.007048 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000214 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000883 | Grad Max: 0.002495 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001019 | Grad Max: 0.002672 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018819 | Grad Max: 0.018819 [GRADIENT NORM TOTAL] 4.9684 [EPOCH SUMMARY] Train Loss: 0.9372 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9190 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9196 -> New: 0.9190) ############################## EPOCH 87/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.407 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.637204 0.36279604] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 649/1399 | B: 548/1500 | C: 275/1773 [LOSS Ex1] A: 0.65735 | B: 0.65369 | C: 0.65106 [LOGITS Ex2 A] Mean Abs: 1.808 | Max: 5.934 [LOSS Ex2] A: 0.17742 | B: 0.39195 | C: 0.30089 ** [JOINT LOSS] ** : 0.944119 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003559 | Grad Max: 0.096520 -> Layer: shared_layers.0.bias | Grad Mean: 0.308446 | Grad Max: 1.166812 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006632 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006358 | Grad Max: 0.006358 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.214410 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035460 | Grad Max: 1.199634 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 | Grad Max: 0.011507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018364 | Grad Max: 0.091302 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000624 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003972 | Grad Max: 0.009543 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000255 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001159 | Grad Max: 0.002901 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001334 | Grad Max: 0.002713 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026652 | Grad Max: 0.026652 [GRADIENT NORM TOTAL] 5.8777 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.483 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58676064 0.4132394 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 534/1082 | B: 565/1483 | C: 278/1770 [LOSS Ex1] A: 0.65596 | B: 0.65443 | C: 0.64945 [LOGITS Ex2 A] Mean Abs: 1.844 | Max: 6.454 [LOSS Ex2] A: 0.16110 | B: 0.38836 | C: 0.28640 ** [JOINT LOSS] ** : 0.931902 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004443 | Grad Max: 0.130053 -> Layer: shared_layers.0.bias | Grad Mean: 0.379303 | Grad Max: 1.766080 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.006710 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007912 | Grad Max: 0.007912 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.276676 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043796 | Grad Max: 1.516453 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.013634 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021201 | Grad Max: 0.098411 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000632 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004606 | Grad Max: 0.009896 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000260 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001333 | Grad Max: 0.003114 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001447 | Grad Max: 0.003540 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029557 | Grad Max: 0.029557 [GRADIENT NORM TOTAL] 7.4833 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.621 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50687194 0.4931281 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 666/1382 | B: 559/1489 | C: 290/1758 [LOSS Ex1] A: 0.65633 | B: 0.65079 | C: 0.64846 [LOGITS Ex2 A] Mean Abs: 1.814 | Max: 7.673 [LOSS Ex2] A: 0.16213 | B: 0.37191 | C: 0.28593 ** [JOINT LOSS] ** : 0.925181 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.070501 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.173462 | Grad Max: 0.864734 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002267 | Grad Max: 0.006957 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006290 | Grad Max: 0.006290 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001120 | Grad Max: 0.158128 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020619 | Grad Max: 0.888754 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.007213 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008896 | Grad Max: 0.044972 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000308 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001943 | Grad Max: 0.004689 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000573 | Grad Max: 0.001472 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000602 | Grad Max: 0.002067 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012977 | Grad Max: 0.012977 [GRADIENT NORM TOTAL] 3.7620 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.572 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100233 0.4899767] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 661/1387 | B: 516/1340 | C: 273/1775 [LOSS Ex1] A: 0.65420 | B: 0.65436 | C: 0.64916 [LOGITS Ex2 A] Mean Abs: 1.769 | Max: 5.252 [LOSS Ex2] A: 0.17023 | B: 0.37819 | C: 0.31117 ** [JOINT LOSS] ** : 0.939105 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003593 | Grad Max: 0.090492 -> Layer: shared_layers.0.bias | Grad Mean: 0.294026 | Grad Max: 1.261735 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002229 | Grad Max: 0.007348 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005804 | Grad Max: 0.005804 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001824 | Grad Max: 0.201028 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034216 | Grad Max: 1.127064 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000317 | Grad Max: 0.010836 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016813 | Grad Max: 0.086741 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000458 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003716 | Grad Max: 0.008084 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001078 | Grad Max: 0.002602 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001194 | Grad Max: 0.002265 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023834 | Grad Max: 0.023834 [GRADIENT NORM TOTAL] 5.6853 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.590 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50227284 0.49772716] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 654/1394 | B: 548/1500 | C: 291/1757 [LOSS Ex1] A: 0.65321 | B: 0.65357 | C: 0.64698 [LOGITS Ex2 A] Mean Abs: 1.745 | Max: 6.299 [LOSS Ex2] A: 0.18290 | B: 0.40866 | C: 0.27180 ** [JOINT LOSS] ** : 0.939041 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004799 | Grad Max: 0.137791 -> Layer: shared_layers.0.bias | Grad Mean: 0.435849 | Grad Max: 1.968644 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002359 | Grad Max: 0.007715 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009492 | Grad Max: 0.009492 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002649 | Grad Max: 0.338881 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049730 | Grad Max: 1.883790 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.012680 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023729 | Grad Max: 0.107616 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000744 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005179 | Grad Max: 0.011471 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000297 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001488 | Grad 
Max: 0.003604 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001716 | Grad Max: 0.003135 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033219 | Grad Max: 0.033219 [GRADIENT NORM TOTAL] 8.6420 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.491 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50416994 0.49583006] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 634/1414 | B: 565/1483 | C: 303/1745 [LOSS Ex1] A: 0.65971 | B: 0.65431 | C: 0.64660 [LOGITS Ex2 A] Mean Abs: 1.751 | Max: 5.897 [LOSS Ex2] A: 0.15760 | B: 0.39136 | C: 0.29270 ** [JOINT LOSS] ** : 0.934095 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.065099 -> Layer: shared_layers.0.bias | Grad Mean: 0.177111 | Grad Max: 0.818712 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.006543 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005871 | Grad Max: 0.005871 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.260585 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021575 | Grad Max: 1.456546 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.007524 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010544 | Grad Max: 0.052393 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002289 | Grad Max: 0.005243 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000672 | Grad Max: 0.001586 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000838 | Grad Max: 0.002104 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015912 | Grad Max: 0.015912 [GRADIENT NORM TOTAL] 3.8582 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.421 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5354926 0.46450746] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 621/1427 | B: 561/1487 | C: 267/1781 [LOSS Ex1] A: 0.65948 | B: 0.65066 | C: 0.64969 [LOGITS Ex2 A] Mean Abs: 1.772 | Max: 6.067 [LOSS Ex2] A: 0.17596 | B: 0.37378 | C: 0.29008 ** [JOINT LOSS] ** : 0.933217 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005486 | Grad Max: 0.124556 -> Layer: shared_layers.0.bias | Grad Mean: 0.353670 | Grad Max: 1.541149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006160 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004101 | Grad Max: 0.004101 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.239072 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042522 | Grad Max: 1.349183 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.012806 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021003 | Grad Max: 0.100265 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000588 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004615 | Grad Max: 0.009643 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000274 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001341 | Grad Max: 0.003067 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001584 | Grad Max: 0.003404 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030757 | Grad Max: 0.030757 [GRADIENT NORM TOTAL] 6.9118 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.556 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6695137 0.33048636] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.049 [MASKS] A(Pass/Fail): 690/1358 | B: 516/1340 | C: 279/1769 [LOSS Ex1] A: 0.65503 | B: 0.65424 | C: 0.64839 [LOGITS Ex2 A] Mean Abs: 1.813 | Max: 5.721 [LOSS Ex2] A: 0.17522 | B: 0.36961 | C: 0.30906 ** [JOINT LOSS] ** : 0.937182 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.008513 | Grad Max: 0.217738 -> Layer: shared_layers.0.bias | Grad Mean: 0.522860 | Grad Max: 2.324744 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.006747 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003171 | Grad Max: 0.003171 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.339107 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063679 | Grad Max: 1.912306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000602 | Grad Max: 0.018664 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031658 | Grad Max: 0.148125 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000852 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007024 | Grad Max: 0.014215 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000365 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002052 | Grad Max: 0.004516 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002461 | Grad Max: 0.004801 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046732 | Grad Max: 0.046732 [GRADIENT NORM TOTAL] 10.1643 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.624 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004 0.4996] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 666/1382 | B: 548/1500 | C: 295/1753 [LOSS Ex1] A: 0.66017 | B: 0.65345 | C: 0.64648 [LOGITS Ex2 A] Mean Abs: 1.803 | Max: 5.461 [LOSS Ex2] A: 0.16647 | B: 0.38752 | C: 0.29881 ** [JOINT LOSS] ** : 0.937636 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004660 | Grad Max: 0.095340 -> Layer: shared_layers.0.bias | Grad Mean: 0.279307 | Grad Max: 1.260947 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005858 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002755 | Grad Max: 0.002755 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.203356 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.034720 | Grad Max: 1.102229 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.010504 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017815 | Grad Max: 0.089640 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003889 | Grad Max: 0.008303 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001122 | Grad Max: 0.002465 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001258 | Grad Max: 0.002739 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024513 | Grad Max: 0.024513 [GRADIENT NORM TOTAL] 5.5065 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.410 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6380608 0.36193916] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 652/1396 | B: 565/1483 | C: 292/1756 [LOSS Ex1] A: 0.65716 | B: 0.65420 | C: 0.64826 [LOGITS Ex2 A] Mean Abs: 1.748 | Max: 5.976 [LOSS Ex2] A: 0.17770 | B: 0.40218 | C: 0.28812 ** [JOINT LOSS] ** : 0.942539 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005468 | Grad Max: 0.134870 -> Layer: shared_layers.0.bias | Grad Mean: 0.341299 | Grad Max: 1.506555 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006175 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000246 | Grad Max: 0.000246 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.238635 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041495 | Grad Max: 1.346310 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.012202 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020600 | Grad Max: 0.101092 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000594 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004586 | Grad Max: 0.009353 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000253 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001350 | Grad Max: 0.003033 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001666 | Grad Max: 0.003016 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031322 | Grad Max: 0.031322 [GRADIENT NORM TOTAL] 6.7023 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.485 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5873356 0.4126644] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 535/1081 | B: 561/1487 | C: 282/1766 [LOSS Ex1] A: 0.65576 | B: 0.65054 | C: 0.64664 [LOGITS Ex2 A] Mean Abs: 1.763 | Max: 6.308 [LOSS Ex2] A: 0.17642 | B: 0.39394 | C: 0.30397 ** [JOINT LOSS] ** : 0.942427 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008920 | Grad Max: 0.227429 -> Layer: shared_layers.0.bias | Grad Mean: 0.542684 | Grad Max: 2.246858 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002315 | Grad Max: 0.007084 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007449 | Grad Max: 0.007449 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003537 | Grad Max: 0.389215 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065893 | Grad Max: 2.121088 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.018073 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032201 | Grad Max: 0.151392 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000961 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007195 | Grad Max: 0.015220 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000439 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002099 | Grad Max: 0.004959 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002559 | Grad Max: 0.004951 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048121 | Grad Max: 0.048121 [GRADIENT NORM TOTAL] 10.6579 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.625 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068829 0.49311706] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 670/1378 | B: 516/1340 | C: 267/1781 [LOSS Ex1] A: 0.65614 | B: 0.65413 | C: 0.65117 [LOGITS Ex2 A] Mean Abs: 1.771 | Max: 6.126 [LOSS Ex2] A: 0.17012 | B: 0.36879 | C: 0.30820 ** [JOINT LOSS] ** : 0.936186 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007255 | Grad Max: 0.200666 -> Layer: shared_layers.0.bias | Grad Mean: 0.326862 | Grad Max: 1.365790 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.007333 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011881 | Grad Max: 0.011881 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.278151 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041240 | Grad Max: 1.488259 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.011959 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019646 | Grad Max: 0.086279 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000585 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004427 | Grad Max: 0.009089 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001280 | Grad Max: 0.003381 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001527 | Grad Max: 0.003131 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028702 | Grad Max: 0.028702 [GRADIENT NORM TOTAL] 6.5351 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.576 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51012135 0.48987868] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.049 [MASKS] A(Pass/Fail): 661/1387 | B: 548/1500 | C: 268/1780 [LOSS Ex1] A: 0.65400 | B: 0.65334 | C: 0.65054 [LOGITS Ex2 A] Mean Abs: 1.804 | Max: 
5.776 [LOSS Ex2] A: 0.16547 | B: 0.38978 | C: 0.30215 ** [JOINT LOSS] ** : 0.938429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002877 | Grad Max: 0.081363 -> Layer: shared_layers.0.bias | Grad Mean: 0.189537 | Grad Max: 0.731566 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006795 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001722 | Grad Max: 0.001722 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001203 | Grad Max: 0.145758 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022416 | Grad Max: 0.813338 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000194 | Grad Max: 0.007963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010252 | Grad Max: 0.057142 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000375 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002240 | Grad Max: 0.005882 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001913 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000647 | Grad Max: 0.001564 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013012 | Grad Max: 0.013012 [GRADIENT NORM TOTAL] 3.7778 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.594 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022637 0.49773625] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 655/1393 | B: 565/1483 | C: 170/1206 [LOSS Ex1] A: 0.65301 | B: 0.65409 | C: 0.65303 [LOGITS Ex2 A] Mean Abs: 1.781 | Max: 6.184 [LOSS Ex2] A: 0.18642 | B: 0.38807 | C: 0.29124 ** [JOINT LOSS] ** : 0.941951 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005111 | Grad Max: 0.161090 -> Layer: shared_layers.0.bias | Grad Mean: 0.325026 | Grad Max: 1.468766 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.006322 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002456 | 
Grad Max: 0.002456 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.231702 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040107 | Grad Max: 1.289719 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.011232 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018731 | Grad Max: 0.084171 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000566 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004192 | Grad Max: 0.009061 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000256 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001220 | Grad Max: 0.002960 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001392 | Grad Max: 0.003004 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027080 | Grad Max: 0.027080 [GRADIENT NORM TOTAL] 6.5852 [EPOCH SUMMARY] Train Loss: 0.9374 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9143 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9190 -> New: 0.9143) ############################## EPOCH 88/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.494 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50413597 0.49586403] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.049 [MASKS] A(Pass/Fail): 634/1414 | B: 562/1486 | C: 297/1751 [LOSS Ex1] A: 0.65954 | B: 0.65042 | C: 0.64710 [LOGITS Ex2 A] Mean Abs: 1.759 | Max: 6.446 [LOSS Ex2] A: 0.16386 | B: 0.36679 | C: 0.28866 ** [JOINT LOSS] ** : 0.925460 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003519 | Grad Max: 0.088640 -> Layer: shared_layers.0.bias | Grad Mean: 0.214625 | Grad Max: 0.993426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002196 | Grad Max: 0.006277 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004747 | Grad Max: 0.004747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001371 | Grad Max: 0.163329 
-> Layer: exit2_layers.0.bias | Grad Mean: 0.025114 | Grad Max: 0.924035 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.006692 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012113 | Grad Max: 0.054974 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000441 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002691 | Grad Max: 0.006190 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000798 | Grad Max: 0.001766 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000917 | Grad Max: 0.002763 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018604 | Grad Max: 0.018604 [GRADIENT NORM TOTAL] 4.3379 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.089 | Max: 0.424 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5357268 0.46427318] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.539 | Std: 0.047 [MASKS] A(Pass/Fail): 623/1425 | B: 516/1340 | C: 252/1796 [LOSS Ex1] A: 0.65931 | B: 0.65402 | C: 0.65026 [LOGITS Ex2 A] Mean Abs: 1.687 | Max: 6.797 [LOSS Ex2] A: 0.18148 | B: 0.37605 | C: 0.29146 ** [JOINT LOSS] ** : 0.937525 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004688 | Grad Max: 0.099575 -> Layer: shared_layers.0.bias | Grad Mean: 0.361064 | Grad Max: 1.458034 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.006445 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000953 | Grad Max: 0.000953 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.286295 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042041 | Grad Max: 1.616733 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.010292 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020465 | Grad Max: 0.088120 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000571 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004489 | Grad Max: 0.009130 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001299 | Grad Max: 0.003112 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001507 | Grad Max: 0.002935 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028797 | Grad Max: 0.028797 [GRADIENT NORM TOTAL] 7.0741 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.559 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67064774 0.3293523 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.050 [MASKS] A(Pass/Fail): 690/1358 | B: 549/1499 | C: 281/1767 [LOSS Ex1] A: 0.65483 | B: 0.65323 | C: 0.64981 [LOGITS Ex2 A] Mean Abs: 1.732 | Max: 6.792 [LOSS Ex2] A: 0.16240 | B: 0.40955 | C: 0.32043 ** [JOINT LOSS] ** : 0.950077 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004741 | Grad Max: 0.147247 -> Layer: shared_layers.0.bias | Grad Mean: 0.448967 | Grad Max: 1.945120 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006882 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006011 | Grad Max: 0.006011 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002800 | Grad Max: 0.387232 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052805 | Grad Max: 2.158943 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000484 | Grad Max: 0.016521 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025842 | Grad Max: 0.136008 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000712 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005641 | Grad Max: 0.011705 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000333 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001650 | Grad Max: 0.003967 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001904 | Grad Max: 0.003809 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037689 | Grad Max: 0.037689 [GRADIENT NORM TOTAL] 9.1317 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.627 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004329 0.49956706] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 666/1382 | B: 565/1483 | C: 298/1750 [LOSS Ex1] A: 0.66000 | B: 0.65397 | C: 0.64641 [LOGITS Ex2 A] Mean Abs: 1.753 | Max: 5.523 [LOSS Ex2] A: 0.15736 | B: 0.38793 | C: 0.30467 ** [JOINT LOSS] ** : 0.936781 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003939 | Grad Max: 0.105045 -> Layer: shared_layers.0.bias | Grad Mean: 0.240508 | Grad Max: 0.977082 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.006120 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003783 | Grad Max: 0.003783 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001491 | Grad Max: 0.199540 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027716 | Grad Max: 1.110599 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000277 | Grad Max: 0.008911 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014582 | Grad Max: 0.066811 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000430 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003237 | Grad Max: 0.006753 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000254 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.002677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.002725 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022506 | Grad Max: 0.022506 [GRADIENT NORM TOTAL] 4.4978 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.413 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63886786 0.36113217] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 653/1395 | B: 562/1486 | C: 270/1778 [LOSS Ex1] A: 0.65698 | B: 0.65030 | C: 0.64864 [LOGITS Ex2 A] Mean Abs: 1.786 | Max: 5.612 
[LOSS Ex2] A: 0.17813 | B: 0.37216 | C: 0.29210 ** [JOINT LOSS] ** : 0.932772 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003829 | Grad Max: 0.121108 -> Layer: shared_layers.0.bias | Grad Mean: 0.315441 | Grad Max: 1.454433 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006318 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005170 | Grad Max: 0.005170 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.240766 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038252 | Grad Max: 1.346906 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000347 | Grad Max: 0.011724 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018478 | Grad Max: 0.090099 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000572 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004080 | Grad Max: 0.008662 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001190 | Grad Max: 0.002668 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001341 | Grad Max: 0.003236 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026939 | Grad Max: 0.026939 [GRADIENT NORM TOTAL] 6.6164 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.488 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58776844 0.4122315 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 535/1081 | B: 516/1340 | C: 282/1766 [LOSS Ex1] A: 0.65557 | B: 0.65391 | C: 0.64939 [LOGITS Ex2 A] Mean Abs: 1.835 | Max: 5.486 [LOSS Ex2] A: 0.16911 | B: 0.37494 | C: 0.28732 ** [JOINT LOSS] ** : 0.930078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006982 | Grad Max: 0.182644 -> Layer: shared_layers.0.bias | Grad Mean: 0.508850 | Grad Max: 2.282396 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.006154 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003739 | Grad Max: 
0.003739 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003271 | Grad Max: 0.320944 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061010 | Grad Max: 1.822636 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.016976 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030092 | Grad Max: 0.140125 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000830 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006661 | Grad Max: 0.014046 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000365 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001948 | Grad Max: 0.004428 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002188 | Grad Max: 0.004567 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043909 | Grad Max: 0.043909 [GRADIENT NORM TOTAL] 10.0762 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.628 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50687426 0.4931257 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 671/1377 | B: 549/1499 | C: 306/1742 [LOSS Ex1] A: 0.65595 | B: 0.65312 | C: 0.64415 [LOGITS Ex2 A] Mean Abs: 1.802 | Max: 7.064 [LOSS Ex2] A: 0.16285 | B: 0.39494 | C: 0.28246 ** [JOINT LOSS] ** : 0.931154 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005460 | Grad Max: 0.137292 -> Layer: shared_layers.0.bias | Grad Mean: 0.379656 | Grad Max: 1.679972 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002215 | Grad Max: 0.006532 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001494 | Grad Max: 0.001494 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002406 | Grad Max: 0.242965 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045062 | Grad Max: 1.347069 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.012201 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022797 | Grad Max: 0.106665 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000694 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.005066 | Grad Max: 0.011175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001499 | Grad Max: 0.003319 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001714 | Grad Max: 0.003835 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034480 | Grad Max: 0.034480 [GRADIENT NORM TOTAL] 7.3393 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.579 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101537 0.48984632] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 662/1386 | B: 565/1483 | C: 280/1768 [LOSS Ex1] A: 0.65379 | B: 0.65387 | C: 0.64962 [LOGITS Ex2 A] Mean Abs: 1.758 | Max: 5.864 [LOSS Ex2] A: 0.16135 | B: 0.39176 | C: 0.28328 ** [JOINT LOSS] ** : 0.931222 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001968 | Grad Max: 0.037427 -> Layer: shared_layers.0.bias | Grad Mean: 0.134070 | Grad Max: 0.608085 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.007373 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012715 | Grad Max: 0.012715 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000888 | Grad Max: 0.230786 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016432 | Grad Max: 1.297672 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.006254 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006855 | Grad Max: 0.039363 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000287 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001461 | Grad Max: 0.003995 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000408 | Grad Max: 0.001259 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001753 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008697 | Grad Max: 0.008697 [GRADIENT NORM 
TOTAL] 3.0670 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.597 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5022763 0.49772373] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 655/1393 | B: 562/1486 | C: 295/1753 [LOSS Ex1] A: 0.65280 | B: 0.65018 | C: 0.64545 [LOGITS Ex2 A] Mean Abs: 1.728 | Max: 7.048 [LOSS Ex2] A: 0.18527 | B: 0.38772 | C: 0.30128 ** [JOINT LOSS] ** : 0.940904 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004676 | Grad Max: 0.117181 -> Layer: shared_layers.0.bias | Grad Mean: 0.340410 | Grad Max: 1.443603 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002379 | Grad Max: 0.007232 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001298 | Grad Max: 0.001298 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.234378 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039086 | Grad Max: 1.331246 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.012858 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019713 | Grad Max: 0.104805 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000560 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004284 | Grad Max: 0.008849 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001239 | Grad Max: 0.002907 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001443 | Grad Max: 0.002687 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027674 | Grad Max: 0.027674 [GRADIENT NORM TOTAL] 6.4536 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.497 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50411165 0.49588835] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.049 [MASKS] A(Pass/Fail): 636/1412 | B: 516/1340 | C: 290/1758 [LOSS Ex1] A: 
0.65936 | B: 0.65379 | C: 0.64717 [LOGITS Ex2 A] Mean Abs: 1.726 | Max: 5.638 [LOSS Ex2] A: 0.16433 | B: 0.36436 | C: 0.25821 ** [JOINT LOSS] ** : 0.915744 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003761 | Grad Max: 0.125368 -> Layer: shared_layers.0.bias | Grad Mean: 0.193989 | Grad Max: 0.781740 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005920 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005297 | Grad Max: 0.005297 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001352 | Grad Max: 0.155480 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024765 | Grad Max: 0.867581 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.006785 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012166 | Grad Max: 0.049139 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000420 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002686 | Grad Max: 0.006568 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000756 | Grad Max: 0.002129 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000866 | Grad Max: 0.002695 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015402 | Grad Max: 0.015402 [GRADIENT NORM TOTAL] 3.8746 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5358893 0.4641107] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 624/1424 | B: 550/1498 | C: 262/1786 [LOSS Ex1] A: 0.65913 | B: 0.65300 | C: 0.65212 [LOGITS Ex2 A] Mean Abs: 1.763 | Max: 6.334 [LOSS Ex2] A: 0.17689 | B: 0.39156 | C: 0.32236 ** [JOINT LOSS] ** : 0.951688 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005359 | Grad Max: 0.137401 -> Layer: shared_layers.0.bias | Grad Mean: 0.346765 | Grad Max: 1.530022 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002016 | Grad 
Max: 0.005916 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004178 | Grad Max: 0.004178 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.273512 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041928 | Grad Max: 1.478631 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000390 | Grad Max: 0.012854 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020717 | Grad Max: 0.103062 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000621 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004585 | Grad Max: 0.010031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001336 | Grad Max: 0.003247 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001483 | Grad Max: 0.003062 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029097 | Grad Max: 0.029097 [GRADIENT NORM TOTAL] 6.8930 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.562 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67175174 0.32824826] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.050 [MASKS] A(Pass/Fail): 691/1357 | B: 565/1483 | C: 266/1782 [LOSS Ex1] A: 0.65463 | B: 0.65374 | C: 0.65177 [LOGITS Ex2 A] Mean Abs: 1.805 | Max: 6.222 [LOSS Ex2] A: 0.17335 | B: 0.39492 | C: 0.29689 ** [JOINT LOSS] ** : 0.941770 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007887 | Grad Max: 0.195482 -> Layer: shared_layers.0.bias | Grad Mean: 0.485248 | Grad Max: 1.911145 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.007003 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005712 | Grad Max: 0.005712 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003086 | Grad Max: 0.386377 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057230 | Grad Max: 2.155375 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000548 | Grad Max: 0.016273 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028924 | Grad Max: 0.135086 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000833 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006425 | Grad Max: 0.013326 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001867 | Grad Max: 0.004274 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002147 | Grad Max: 0.004164 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041356 | Grad Max: 0.041356 [GRADIENT NORM TOTAL] 9.4567 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.630 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004511 0.49954888] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 668/1380 | B: 562/1486 | C: 270/1778 [LOSS Ex1] A: 0.65982 | B: 0.65006 | C: 0.64927 [LOGITS Ex2 A] Mean Abs: 1.787 | Max: 5.549 [LOSS Ex2] A: 0.16262 | B: 0.36887 | C: 0.30319 ** [JOINT LOSS] ** : 0.931278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002754 | Grad Max: 0.072637 -> Layer: shared_layers.0.bias | Grad Mean: 0.232772 | Grad Max: 0.995903 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005831 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000160 | Grad Max: 0.000160 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001483 | Grad Max: 0.180704 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027693 | Grad Max: 1.006372 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.008853 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013686 | Grad Max: 0.072554 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000407 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002901 | Grad Max: 0.006488 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000825 | Grad Max: 0.002063 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000873 | Grad Max: 0.002324 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.017533 | Grad Max: 0.017533 [GRADIENT NORM TOTAL] 4.7181 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.416 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6397509 0.36024916] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 653/1395 | B: 516/1340 | C: 191/1185 [LOSS Ex1] A: 0.65679 | B: 0.65367 | C: 0.64647 [LOGITS Ex2 A] Mean Abs: 1.743 | Max: 5.662 [LOSS Ex2] A: 0.17501 | B: 0.35933 | C: 0.26676 ** [JOINT LOSS] ** : 0.919342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005578 | Grad Max: 0.143736 -> Layer: shared_layers.0.bias | Grad Mean: 0.359004 | Grad Max: 1.523907 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.006163 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002703 | Grad Max: 0.002703 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.272056 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041510 | Grad Max: 1.506926 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.012969 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020292 | Grad Max: 0.094658 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000672 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004520 | Grad Max: 0.010558 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000279 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001286 | Grad Max: 0.003220 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001371 | Grad Max: 0.003098 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026696 | Grad Max: 0.026696 [GRADIENT NORM TOTAL] 6.8379 [EPOCH SUMMARY] Train Loss: 0.9340 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9240 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 89/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.491 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58833754 0.41166246] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.050 [MASKS] A(Pass/Fail): 536/1080 | B: 551/1497 | C: 284/1764 [LOSS Ex1] A: 0.65537 | B: 0.65288 | C: 0.64650 [LOGITS Ex2 A] Mean Abs: 1.780 | Max: 6.255 [LOSS Ex2] A: 0.16210 | B: 0.40528 | C: 0.28074 ** [JOINT LOSS] ** : 0.934292 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007708 | Grad Max: 0.180835 -> Layer: shared_layers.0.bias | Grad Mean: 0.540948 | Grad Max: 2.347214 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.006916 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006157 | Grad Max: 0.006157 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003491 | Grad Max: 0.423662 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065319 | Grad Max: 2.346340 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000614 | Grad Max: 0.019192 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032584 | Grad Max: 0.157733 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000939 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007204 | Grad Max: 0.015549 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000414 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002118 | Grad Max: 0.004799 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002576 | Grad Max: 0.004473 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048624 | Grad Max: 0.048624 [GRADIENT NORM TOTAL] 10.8141 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.632 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068857 0.49311423] | 
Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 676/1372 | B: 565/1483 | C: 303/1745 [LOSS Ex1] A: 0.65576 | B: 0.65363 | C: 0.64508 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 6.566 [LOSS Ex2] A: 0.16049 | B: 0.38425 | C: 0.29052 ** [JOINT LOSS] ** : 0.929910 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005261 | Grad Max: 0.162287 -> Layer: shared_layers.0.bias | Grad Mean: 0.268404 | Grad Max: 1.307724 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.006445 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001417 | Grad Max: 0.001417 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001780 | Grad Max: 0.296751 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032514 | Grad Max: 1.648130 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.008801 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015222 | Grad Max: 0.069508 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000505 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003419 | Grad Max: 0.007643 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000227 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001000 | Grad Max: 0.002387 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.002460 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022507 | Grad Max: 0.022507 [GRADIENT NORM TOTAL] 5.7263 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.583 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102206 0.48977938] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 662/1386 | B: 562/1486 | C: 274/1774 [LOSS Ex1] A: 0.65358 | B: 0.64994 | C: 0.64926 [LOGITS Ex2 A] Mean Abs: 1.809 | Max: 5.401 [LOSS Ex2] A: 0.16821 | B: 0.37383 | C: 0.31233 ** [JOINT LOSS] ** : 0.935718 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003718 | Grad Max: 
0.108989 -> Layer: shared_layers.0.bias | Grad Mean: 0.225723 | Grad Max: 1.066452 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.006257 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004404 | Grad Max: 0.004404 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001479 | Grad Max: 0.177295 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027389 | Grad Max: 0.958216 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000228 | Grad Max: 0.007472 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012136 | Grad Max: 0.057174 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002653 | Grad Max: 0.006193 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000180 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.001720 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000860 | Grad Max: 0.001906 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016464 | Grad Max: 0.016464 [GRADIENT NORM TOTAL] 4.6244 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.601 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50229526 0.49770477] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 655/1393 | B: 516/1340 | C: 262/1786 [LOSS Ex1] A: 0.65259 | B: 0.65357 | C: 0.64951 [LOGITS Ex2 A] Mean Abs: 1.798 | Max: 6.552 [LOSS Ex2] A: 0.19418 | B: 0.35712 | C: 0.30117 ** [JOINT LOSS] ** : 0.936050 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007357 | Grad Max: 0.239239 -> Layer: shared_layers.0.bias | Grad Mean: 0.364584 | Grad Max: 1.490523 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.007132 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003510 | Grad Max: 0.003510 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002579 | Grad Max: 0.251047 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047050 | Grad Max: 1.421221 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.013202 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022957 | Grad Max: 0.100593 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000658 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005192 | Grad Max: 0.010654 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001522 | Grad Max: 0.003714 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001793 | Grad Max: 0.003720 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034316 | Grad Max: 0.034316 [GRADIENT NORM TOTAL] 7.3326 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.500 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50410783 0.49589217] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.049 [MASKS] A(Pass/Fail): 637/1411 | B: 551/1497 | C: 264/1784 [LOSS Ex1] A: 0.65918 | B: 0.65277 | C: 0.65008 [LOGITS Ex2 A] Mean Abs: 1.759 | Max: 5.241 [LOSS Ex2] A: 0.16320 | B: 0.38386 | C: 0.29659 ** [JOINT LOSS] ** : 0.935228 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003558 | Grad Max: 0.102268 -> Layer: shared_layers.0.bias | Grad Mean: 0.126095 | Grad Max: 0.515407 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005923 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000333 | Grad Max: 0.000333 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000892 | Grad Max: 0.105907 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015735 | Grad Max: 0.594537 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.004896 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007192 | Grad Max: 0.029379 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000308 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001673 | Grad Max: 0.004409 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000146 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000479 | Grad Max: 0.001476 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001927 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010289 | Grad Max: 0.010289 [GRADIENT NORM TOTAL] 2.5176 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.429 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5360387 0.4639613] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 627/1421 | B: 565/1483 | C: 292/1756 [LOSS Ex1] A: 0.65895 | B: 0.65352 | C: 0.64814 [LOGITS Ex2 A] Mean Abs: 1.698 | Max: 6.273 [LOSS Ex2] A: 0.17750 | B: 0.40434 | C: 0.28563 ** [JOINT LOSS] ** : 0.942691 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006611 | Grad Max: 0.189470 -> Layer: shared_layers.0.bias | Grad Mean: 0.455846 | Grad Max: 1.972015 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.006430 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005476 | Grad Max: 0.005476 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002872 | Grad Max: 0.319782 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053902 | Grad Max: 1.748670 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.013598 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026048 | Grad Max: 0.116962 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000672 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005757 | Grad Max: 0.011844 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000347 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001679 | Grad Max: 0.004190 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001963 | Grad Max: 0.003587 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037580 | Grad Max: 0.037580 [GRADIENT NORM TOTAL] 8.8648 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.097 | Max: 0.565 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6728455 0.32715458] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.050 [MASKS] A(Pass/Fail): 692/1356 | B: 563/1485 | C: 286/1762 [LOSS Ex1] A: 0.65443 | B: 0.64982 | C: 0.64606 [LOGITS Ex2 A] Mean Abs: 1.730 | Max: 6.385 [LOSS Ex2] A: 0.16184 | B: 0.38960 | C: 0.29822 ** [JOINT LOSS] ** : 0.933320 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006211 | Grad Max: 0.161845 -> Layer: shared_layers.0.bias | Grad Mean: 0.515156 | Grad Max: 2.238544 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.007341 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008466 | Grad Max: 0.008466 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.378496 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060910 | Grad Max: 2.118878 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.018269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029736 | Grad Max: 0.151278 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000878 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006528 | Grad Max: 0.014313 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001904 | Grad Max: 0.004163 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002230 | Grad Max: 0.004024 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042457 | Grad Max: 0.042457 [GRADIENT NORM TOTAL] 10.2571 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.634 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004217 0.49957833] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 668/1380 | B: 517/1339 | C: 276/1772 [LOSS Ex1] A: 0.65964 | B: 0.65345 | C: 0.64794 [LOGITS Ex2 A] Mean Abs: 1.745 | Max: 5.429 [LOSS Ex2] A: 0.16552 | B: 0.36623 | C: 0.27994 ** [JOINT LOSS] ** : 0.924241 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006327 | Grad Max: 0.179407 -> Layer: shared_layers.0.bias | Grad Mean: 0.369146 | Grad Max: 1.552226 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.006021 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000680 | Grad Max: 0.000680 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.289744 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045210 | Grad Max: 1.612117 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.012052 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022203 | Grad Max: 0.104997 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000579 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004975 | Grad Max: 0.010512 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000297 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001441 | Grad Max: 0.003341 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001614 | Grad Max: 0.003457 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031305 | Grad Max: 0.031305 [GRADIENT NORM TOTAL] 7.2950 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.419 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64049816 0.3595018 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 653/1395 | B: 551/1497 | C: 291/1757 [LOSS Ex1] A: 0.65659 | B: 0.65266 | C: 0.64820 [LOGITS Ex2 A] Mean Abs: 1.786 | Max: 5.423 [LOSS Ex2] A: 0.17428 | B: 0.38755 | C: 0.28885 ** [JOINT LOSS] ** : 0.936042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002637 | Grad Max: 0.083023 -> Layer: shared_layers.0.bias | Grad Mean: 0.203529 | Grad Max: 0.979730 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.006228 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000059 | Grad Max: 0.000059 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001327 | Grad Max: 
0.149643 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024247 | Grad Max: 0.840486 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.009172 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012307 | Grad Max: 0.072153 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000355 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002671 | Grad Max: 0.006426 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000773 | Grad Max: 0.001893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000830 | Grad Max: 0.002121 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016893 | Grad Max: 0.016893 [GRADIENT NORM TOTAL] 4.1577 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.494 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58872885 0.41127118] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.050 [MASKS] A(Pass/Fail): 536/1080 | B: 565/1483 | C: 308/1740 [LOSS Ex1] A: 0.65517 | B: 0.65341 | C: 0.64655 [LOGITS Ex2 A] Mean Abs: 1.853 | Max: 5.929 [LOSS Ex2] A: 0.16426 | B: 0.39384 | C: 0.30217 ** [JOINT LOSS] ** : 0.938467 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002924 | Grad Max: 0.101850 -> Layer: shared_layers.0.bias | Grad Mean: 0.296639 | Grad Max: 1.358741 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.007134 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006572 | Grad Max: 0.006572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001865 | Grad Max: 0.226990 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034861 | Grad Max: 1.284668 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000318 | Grad Max: 0.010935 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016990 | Grad Max: 0.083918 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000528 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003686 | Grad Max: 0.008070 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001059 | Grad Max: 0.002551 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001122 | Grad Max: 0.002615 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022607 | Grad Max: 0.022607 [GRADIENT NORM TOTAL] 6.1741 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.635 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50692725 0.49307272] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 676/1372 | B: 569/1479 | C: 274/1774 [LOSS Ex1] A: 0.65556 | B: 0.64970 | C: 0.64940 [LOGITS Ex2 A] Mean Abs: 1.812 | Max: 7.043 [LOSS Ex2] A: 0.16341 | B: 0.36356 | C: 0.29296 ** [JOINT LOSS] ** : 0.924861 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002577 | Grad Max: 0.102322 -> Layer: shared_layers.0.bias | Grad Mean: 0.209497 | Grad Max: 1.063770 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.006938 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008314 | Grad Max: 0.008314 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001334 | Grad Max: 0.162416 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024295 | Grad Max: 0.905910 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.008167 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011216 | Grad Max: 0.061102 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002368 | Grad Max: 0.005306 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000695 | Grad Max: 0.001667 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000734 | Grad Max: 0.002261 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015941 | Grad Max: 0.015941 [GRADIENT NORM TOTAL] 4.3274 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.586 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51020205 0.489798 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 663/1385 | B: 524/1332 | C: 269/1779 [LOSS Ex1] A: 0.65337 | B: 0.65334 | C: 0.64923 [LOGITS Ex2 A] Mean Abs: 1.773 | Max: 5.707 [LOSS Ex2] A: 0.16775 | B: 0.37458 | C: 0.29446 ** [JOINT LOSS] ** : 0.930908 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004765 | Grad Max: 0.115176 -> Layer: shared_layers.0.bias | Grad Mean: 0.370177 | Grad Max: 1.641537 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006922 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006638 | Grad Max: 0.006638 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.276886 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044732 | Grad Max: 1.554258 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.013308 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021798 | Grad Max: 0.107491 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000652 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004785 | Grad Max: 0.009963 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000283 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001381 | Grad Max: 0.003314 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001568 | Grad Max: 0.003252 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029954 | Grad Max: 0.029954 [GRADIENT NORM TOTAL] 7.4284 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.604 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023503 0.4976497] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 655/1393 | B: 555/1493 | C: 270/1778 [LOSS Ex1] A: 0.65238 | B: 0.65254 | C: 0.64815 [LOGITS Ex2 A] Mean Abs: 
1.730 | Max: 5.575 [LOSS Ex2] A: 0.17806 | B: 0.40392 | C: 0.30219 ** [JOINT LOSS] ** : 0.945746 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004321 | Grad Max: 0.141460 -> Layer: shared_layers.0.bias | Grad Mean: 0.391682 | Grad Max: 1.888290 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002310 | Grad Max: 0.007275 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007911 | Grad Max: 0.007911 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002441 | Grad Max: 0.396415 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045784 | Grad Max: 2.227519 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000413 | Grad Max: 0.013161 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022146 | Grad Max: 0.109318 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000590 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004763 | Grad Max: 0.010000 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001384 | Grad Max: 0.003357 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001626 | Grad Max: 0.002874 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031693 | Grad Max: 0.031693 [GRADIENT NORM TOTAL] 8.0784 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.503 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041085 0.4958915] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.540 | Std: 0.049 [MASKS] A(Pass/Fail): 637/1411 | B: 568/1480 | C: 192/1184 [LOSS Ex1] A: 0.65899 | B: 0.65329 | C: 0.64767 [LOGITS Ex2 A] Mean Abs: 1.742 | Max: 6.067 [LOSS Ex2] A: 0.16596 | B: 0.38899 | C: 0.30213 ** [JOINT LOSS] ** : 0.939016 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002659 | Grad Max: 0.071114 -> Layer: shared_layers.0.bias | Grad Mean: 0.194236 | Grad Max: 0.970032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005473 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001616 | Grad Max: 0.001616 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001342 | Grad Max: 0.246155 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024733 | Grad Max: 1.377778 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000228 | Grad Max: 0.006988 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012167 | Grad Max: 0.055253 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000370 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002639 | Grad Max: 0.006071 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000180 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.001971 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000955 | Grad Max: 0.002282 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017740 | Grad Max: 0.017740 [GRADIENT NORM TOTAL] 4.2723 [EPOCH SUMMARY] Train Loss: 0.9347 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9180 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 90/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.090 | Max: 0.432 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53612417 0.46387586] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 627/1421 | B: 574/1474 | C: 283/1765 [LOSS Ex1] A: 0.65877 | B: 0.64958 | C: 0.64678 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 5.850 [LOSS Ex2] A: 0.17378 | B: 0.37204 | C: 0.27759 ** [JOINT LOSS] ** : 0.926179 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005912 | Grad Max: 0.151513 -> Layer: shared_layers.0.bias | Grad Mean: 0.371956 | Grad Max: 1.551225 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006238 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002166 | Grad Max: 0.002166 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002311 | Grad Max: 0.236688 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.042941 | Grad Max: 1.320596 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.012821 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021394 | Grad Max: 0.105086 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004759 | Grad Max: 0.009457 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001406 | Grad Max: 0.003068 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001631 | Grad Max: 0.003781 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032212 | Grad Max: 0.032212 [GRADIENT NORM TOTAL] 7.0313 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.568 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6739083 0.32609177] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 693/1355 | B: 524/1332 | C: 277/1771 [LOSS Ex1] A: 0.65423 | B: 0.65322 | C: 0.64856 [LOGITS Ex2 A] Mean Abs: 1.816 | Max: 5.455 [LOSS Ex2] A: 0.17030 | B: 0.37039 | C: 0.26911 ** [JOINT LOSS] ** : 0.921939 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008618 | Grad Max: 0.259520 -> Layer: shared_layers.0.bias | Grad Mean: 0.467522 | Grad Max: 2.011271 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.007016 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007053 | Grad Max: 0.007053 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003117 | Grad Max: 0.318813 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057393 | Grad Max: 1.718957 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000530 | Grad Max: 0.015307 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028025 | Grad Max: 0.127041 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000787 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006325 | Grad Max: 0.013274 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001855 | Grad Max: 0.004194 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002126 | Grad Max: 0.004490 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041464 | Grad Max: 0.041464 [GRADIENT NORM TOTAL] 9.1746 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.637 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004218 0.49957818] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 668/1380 | B: 556/1492 | C: 281/1767 [LOSS Ex1] A: 0.65946 | B: 0.65242 | C: 0.64670 [LOGITS Ex2 A] Mean Abs: 1.802 | Max: 5.809 [LOSS Ex2] A: 0.15919 | B: 0.38009 | C: 0.29885 ** [JOINT LOSS] ** : 0.932238 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003148 | Grad Max: 0.096211 -> Layer: shared_layers.0.bias | Grad Mean: 0.264672 | Grad Max: 1.238688 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006571 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008529 | Grad Max: 0.008529 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001671 | Grad Max: 0.182948 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031453 | Grad Max: 1.012811 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.010566 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015594 | Grad Max: 0.084175 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000449 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003382 | Grad Max: 0.007113 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000206 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000977 | Grad Max: 0.002266 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001057 | Grad Max: 0.002720 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021313 | Grad Max: 0.021313 [GRADIENT NORM TOTAL] 5.3818 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.421 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6413189 0.35868105] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 654/1394 | B: 568/1480 | C: 290/1758 [LOSS Ex1] A: 0.65640 | B: 0.65317 | C: 0.64835 [LOGITS Ex2 A] Mean Abs: 1.751 | Max: 6.676 [LOSS Ex2] A: 0.16747 | B: 0.38758 | C: 0.31519 ** [JOINT LOSS] ** : 0.942723 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004757 | Grad Max: 0.111824 -> Layer: shared_layers.0.bias | Grad Mean: 0.286383 | Grad Max: 1.323589 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.006326 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000307 | Grad Max: 0.000307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001871 | Grad Max: 0.225813 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034297 | Grad Max: 1.282915 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.010624 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017437 | Grad Max: 0.086837 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000519 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003932 | Grad Max: 0.008253 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000252 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001175 | Grad Max: 0.002902 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001446 | Grad Max: 0.002853 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027807 | Grad Max: 0.027807 [GRADIENT NORM TOTAL] 5.4802 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.496 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.58924496 0.41075504] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 538/1078 | B: 575/1473 | C: 286/1762 [LOSS Ex1] A: 0.65497 | B: 0.64946 | C: 0.64767 [LOGITS Ex2 A] Mean Abs: 1.791 | Max: 5.637 
[LOSS Ex2] A: 0.16764 | B: 0.38130 | C: 0.27678 ** [JOINT LOSS] ** : 0.925942 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006263 | Grad Max: 0.158317 -> Layer: shared_layers.0.bias | Grad Mean: 0.411033 | Grad Max: 1.627846 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.006615 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005839 | Grad Max: 0.005839 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002628 | Grad Max: 0.310749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048895 | Grad Max: 1.737463 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.014014 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024374 | Grad Max: 0.114914 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000709 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005432 | Grad Max: 0.011512 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001575 | Grad Max: 0.003803 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001859 | Grad Max: 0.003437 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035272 | Grad Max: 0.035272 [GRADIENT NORM TOTAL] 7.8220 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.638 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069125 0.49308747] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 677/1371 | B: 524/1332 | C: 312/1736 [LOSS Ex1] A: 0.65537 | B: 0.65310 | C: 0.64566 [LOGITS Ex2 A] Mean Abs: 1.786 | Max: 6.650 [LOSS Ex2] A: 0.16188 | B: 0.36475 | C: 0.27266 ** [JOINT LOSS] ** : 0.917808 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.068718 -> Layer: shared_layers.0.bias | Grad Mean: 0.140995 | Grad Max: 0.585823 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006144 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001208 | Grad Max: 
0.001208 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000847 | Grad Max: 0.154997 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015458 | Grad Max: 0.853149 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.005216 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007542 | Grad Max: 0.035297 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000281 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001647 | Grad Max: 0.004478 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000132 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000453 | Grad Max: 0.001353 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000574 | Grad Max: 0.001818 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009366 | Grad Max: 0.009366 [GRADIENT NORM TOTAL] 2.7910 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.590 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51024634 0.48975363] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 664/1384 | B: 556/1492 | C: 264/1784 [LOSS Ex1] A: 0.65316 | B: 0.65230 | C: 0.65041 [LOGITS Ex2 A] Mean Abs: 1.828 | Max: 5.745 [LOSS Ex2] A: 0.16897 | B: 0.38907 | C: 0.31134 ** [JOINT LOSS] ** : 0.941751 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006583 | Grad Max: 0.154210 -> Layer: shared_layers.0.bias | Grad Mean: 0.418553 | Grad Max: 1.799149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.007284 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008040 | Grad Max: 0.008040 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002750 | Grad Max: 0.274588 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051154 | Grad Max: 1.453287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000465 | Grad Max: 0.014269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024813 | Grad Max: 0.116356 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000774 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.005502 | Grad Max: 0.011484 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001608 | Grad Max: 0.003996 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001858 | Grad Max: 0.003888 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036099 | Grad Max: 0.036099 [GRADIENT NORM TOTAL] 8.2558 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.608 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023442 0.4976558] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 656/1392 | B: 568/1480 | C: 287/1761 [LOSS Ex1] A: 0.65217 | B: 0.65306 | C: 0.64652 [LOGITS Ex2 A] Mean Abs: 1.823 | Max: 6.511 [LOSS Ex2] A: 0.19002 | B: 0.39549 | C: 0.30622 ** [JOINT LOSS] ** : 0.947825 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009009 | Grad Max: 0.240570 -> Layer: shared_layers.0.bias | Grad Mean: 0.576189 | Grad Max: 2.478792 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.007141 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002237 | Grad Max: 0.002237 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003785 | Grad Max: 0.392867 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070356 | Grad Max: 2.148390 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000637 | Grad Max: 0.019486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033876 | Grad Max: 0.161356 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000936 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007528 | Grad Max: 0.015541 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000450 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002173 | Grad Max: 0.005475 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002432 | Grad Max: 0.004430 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046523 | Grad Max: 0.046523 [GRADIENT NORM 
TOTAL] 11.4041 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.506 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50406975 0.49593022] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 637/1411 | B: 575/1473 | C: 294/1754 [LOSS Ex1] A: 0.65881 | B: 0.64934 | C: 0.64572 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 5.626 [LOSS Ex2] A: 0.16761 | B: 0.36920 | C: 0.28800 ** [JOINT LOSS] ** : 0.926227 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003383 | Grad Max: 0.095321 -> Layer: shared_layers.0.bias | Grad Mean: 0.299511 | Grad Max: 1.153817 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006452 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007100 | Grad Max: 0.007100 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001854 | Grad Max: 0.227946 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034940 | Grad Max: 1.275959 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.012257 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017619 | Grad Max: 0.092840 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000544 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003823 | Grad Max: 0.008191 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001116 | Grad Max: 0.002566 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001185 | Grad Max: 0.002985 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024440 | Grad Max: 0.024440 [GRADIENT NORM TOTAL] 5.9179 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.435 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53632414 0.4636759 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.048 [MASKS] A(Pass/Fail): 627/1421 | B: 524/1332 | C: 254/1794 [LOSS Ex1] 
A: 0.65859 | B: 0.65299 | C: 0.65039 [LOGITS Ex2 A] Mean Abs: 1.721 | Max: 5.986 [LOSS Ex2] A: 0.17158 | B: 0.36675 | C: 0.32686 ** [JOINT LOSS] ** : 0.942389 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004698 | Grad Max: 0.104231 -> Layer: shared_layers.0.bias | Grad Mean: 0.308859 | Grad Max: 1.344249 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.006191 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005708 | Grad Max: 0.005708 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002013 | Grad Max: 0.253213 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037635 | Grad Max: 1.407548 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.010457 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018071 | Grad Max: 0.082488 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000531 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003997 | Grad Max: 0.008374 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001171 | Grad Max: 0.002757 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001439 | Grad Max: 0.003014 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027066 | Grad Max: 0.027066 [GRADIENT NORM TOTAL] 6.1985 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.572 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6750052 0.32499477] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.051 [MASKS] A(Pass/Fail): 693/1355 | B: 557/1491 | C: 294/1754 [LOSS Ex1] A: 0.65402 | B: 0.65219 | C: 0.64674 [LOGITS Ex2 A] Mean Abs: 1.759 | Max: 6.763 [LOSS Ex2] A: 0.16666 | B: 0.41137 | C: 0.28548 ** [JOINT LOSS] ** : 0.938822 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005559 | Grad Max: 0.141036 -> Layer: shared_layers.0.bias | Grad Mean: 0.415461 | Grad Max: 1.864884 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | 
Grad Max: 0.006946 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007340 | Grad Max: 0.007340 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002588 | Grad Max: 0.366275 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048358 | Grad Max: 2.058126 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.013804 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022738 | Grad Max: 0.109250 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000669 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005014 | Grad Max: 0.010691 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000295 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001461 | Grad Max: 0.003393 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001664 | Grad Max: 0.003186 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032336 | Grad Max: 0.032336 [GRADIENT NORM TOTAL] 8.5341 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.641 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50043046 0.49956954] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 670/1378 | B: 569/1479 | C: 309/1739 [LOSS Ex1] A: 0.65928 | B: 0.65294 | C: 0.64347 [LOGITS Ex2 A] Mean Abs: 1.784 | Max: 6.119 [LOSS Ex2] A: 0.15558 | B: 0.39447 | C: 0.27713 ** [JOINT LOSS] ** : 0.927628 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004024 | Grad Max: 0.125306 -> Layer: shared_layers.0.bias | Grad Mean: 0.230756 | Grad Max: 1.155406 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005999 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001877 | Grad Max: 0.001877 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001511 | Grad Max: 0.247612 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027624 | Grad Max: 1.390795 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.007282 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011994 | Grad Max: 
0.059262 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000361 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002723 | Grad Max: 0.006421 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000805 | Grad Max: 0.002132 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000989 | Grad Max: 0.002334 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018445 | Grad Max: 0.018445 [GRADIENT NORM TOTAL] 4.9208 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.424 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64209425 0.35790578] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 654/1394 | B: 575/1473 | C: 263/1785 [LOSS Ex1] A: 0.65620 | B: 0.64922 | C: 0.64938 [LOGITS Ex2 A] Mean Abs: 1.804 | Max: 6.039 [LOSS Ex2] A: 0.17263 | B: 0.36785 | C: 0.29416 ** [JOINT LOSS] ** : 0.929816 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002927 | Grad Max: 0.095361 -> Layer: shared_layers.0.bias | Grad Mean: 0.223487 | Grad Max: 1.177606 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006235 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000567 | Grad Max: 0.000567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001488 | Grad Max: 0.173193 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027096 | Grad Max: 0.954938 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.008203 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013496 | Grad Max: 0.061773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000451 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002987 | Grad Max: 0.007076 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002188 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001054 | Grad Max: 0.002516 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.020654 | Grad Max: 0.020654 [GRADIENT NORM TOTAL] 4.6808 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.499 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5896842 0.41031575] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 539/1077 | B: 524/1332 | C: 164/1212 [LOSS Ex1] A: 0.65477 | B: 0.65288 | C: 0.64996 [LOGITS Ex2 A] Mean Abs: 1.854 | Max: 6.277 [LOSS Ex2] A: 0.15483 | B: 0.36193 | C: 0.29849 ** [JOINT LOSS] ** : 0.924284 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003692 | Grad Max: 0.100178 -> Layer: shared_layers.0.bias | Grad Mean: 0.282581 | Grad Max: 1.260493 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.006505 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006290 | Grad Max: 0.006290 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.178317 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035321 | Grad Max: 0.989205 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.011064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017034 | Grad Max: 0.089723 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000552 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003744 | Grad Max: 0.008659 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001080 | Grad Max: 0.002525 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001160 | Grad Max: 0.002980 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023443 | Grad Max: 0.023443 [GRADIENT NORM TOTAL] 5.7097 [EPOCH SUMMARY] Train Loss: 0.9318 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9104 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9143 -> New: 0.9104) ############################## EPOCH 91/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.642 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50693864 0.49306136] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 677/1371 | B: 557/1491 | C: 271/1777 [LOSS Ex1] A: 0.65517 | B: 0.65207 | C: 0.64890 [LOGITS Ex2 A] Mean Abs: 1.804 | Max: 6.409 [LOSS Ex2] A: 0.15922 | B: 0.38517 | C: 0.29861 ** [JOINT LOSS] ** : 0.933049 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.052655 -> Layer: shared_layers.0.bias | Grad Mean: 0.037736 | Grad Max: 0.142953 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006195 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000490 | Grad Max: 0.000490 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000413 | Grad Max: 0.133576 -> Layer: exit2_layers.0.bias | Grad Mean: 0.006879 | Grad Max: 0.732717 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002450 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001488 | Grad Max: 0.012846 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000153 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000265 | Grad Max: 0.001622 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000055 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000075 | Grad Max: 0.000469 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000334 | Grad Max: 0.000973 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000040 | Grad Max: 0.000040 [GRADIENT NORM TOTAL] 1.3367 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.593 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51026 0.48973998] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 664/1384 | B: 569/1479 | C: 275/1773 [LOSS Ex1] A: 0.65294 | B: 0.65282 | C: 0.64895 [LOGITS Ex2 A] Mean Abs: 1.782 | Max: 6.107 [LOSS Ex2] A: 0.16175 | B: 0.39879 | C: 0.29624 ** [JOINT LOSS] ** : 0.937161 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003027 | Grad Max: 0.099156 -> Layer: shared_layers.0.bias | Grad Mean: 0.312068 | Grad Max: 1.365302 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.007160 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006480 | Grad Max: 0.006480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001952 | Grad Max: 0.257526 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036483 | Grad Max: 1.436378 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010856 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018080 | Grad Max: 0.079522 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000559 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003968 | Grad Max: 0.008357 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000251 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001160 | Grad Max: 0.002909 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001335 | Grad Max: 0.002562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025859 | Grad Max: 0.025859 [GRADIENT NORM TOTAL] 6.1694 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.612 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5023902 0.49760976] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 656/1392 | B: 575/1473 | C: 252/1796 [LOSS Ex1] A: 0.65195 | B: 0.64908 | C: 0.65014 [LOGITS Ex2 A] Mean Abs: 1.773 | Max: 6.965 [LOSS Ex2] A: 0.17945 | B: 0.37247 | C: 0.27531 ** [JOINT LOSS] ** : 0.926132 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003416 | Grad Max: 0.083281 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.253905 | Grad Max: 1.034825 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.006862 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000903 | Grad Max: 0.000903 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001508 | Grad Max: 0.209436 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028003 | Grad Max: 1.151640 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.011137 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014826 | Grad Max: 0.092071 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000473 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003227 | Grad Max: 0.007158 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000930 | Grad Max: 0.002288 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001076 | Grad Max: 0.002455 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020562 | Grad Max: 0.020562 [GRADIENT NORM TOTAL] 4.6939 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.509 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504078 0.495922] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 637/1411 | B: 527/1329 | C: 272/1776 [LOSS Ex1] A: 0.65861 | B: 0.65274 | C: 0.64887 [LOGITS Ex2 A] Mean Abs: 1.769 | Max: 5.525 [LOSS Ex2] A: 0.15993 | B: 0.36659 | C: 0.28308 ** [JOINT LOSS] ** : 0.923274 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003158 | Grad Max: 0.078901 -> Layer: shared_layers.0.bias | Grad Mean: 0.189416 | Grad Max: 0.906628 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005657 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001132 | Grad Max: 0.001132 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001206 | Grad Max: 0.170423 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021879 | Grad Max: 0.957758 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | 
Grad Max: 0.005595 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009231 | Grad Max: 0.043727 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000345 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002070 | Grad Max: 0.004872 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000608 | Grad Max: 0.001519 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000676 | Grad Max: 0.002159 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013920 | Grad Max: 0.013920 [GRADIENT NORM TOTAL] 4.0012 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.091 | Max: 0.438 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53646034 0.4635397 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.540 | Std: 0.049 [MASKS] A(Pass/Fail): 628/1420 | B: 559/1489 | C: 290/1758 [LOSS Ex1] A: 0.65839 | B: 0.65193 | C: 0.64660 [LOGITS Ex2 A] Mean Abs: 1.757 | Max: 6.197 [LOSS Ex2] A: 0.17774 | B: 0.38329 | C: 0.30673 ** [JOINT LOSS] ** : 0.941560 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.112279 -> Layer: shared_layers.0.bias | Grad Mean: 0.266477 | Grad Max: 1.223575 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005916 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004871 | Grad Max: 0.004871 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001695 | Grad Max: 0.223539 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031362 | Grad Max: 1.162058 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.010015 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014797 | Grad Max: 0.069218 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000427 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003203 | Grad Max: 0.006955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000215 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000923 | Grad Max: 
0.002328 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001004 | Grad Max: 0.002308 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019795 | Grad Max: 0.019795 [GRADIENT NORM TOTAL] 5.4068 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.575 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6761805 0.3238195] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.051 [MASKS] A(Pass/Fail): 693/1355 | B: 570/1478 | C: 304/1744 [LOSS Ex1] A: 0.65380 | B: 0.65268 | C: 0.64450 [LOGITS Ex2 A] Mean Abs: 1.789 | Max: 5.836 [LOSS Ex2] A: 0.15463 | B: 0.38336 | C: 0.28121 ** [JOINT LOSS] ** : 0.923390 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001705 | Grad Max: 0.038782 -> Layer: shared_layers.0.bias | Grad Mean: 0.054890 | Grad Max: 0.222208 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002267 | Grad Max: 0.007106 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006481 | Grad Max: 0.006481 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000422 | Grad Max: 0.192756 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007135 | Grad Max: 1.086710 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002700 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001369 | Grad Max: 0.014025 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000136 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000243 | Grad Max: 0.001715 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000079 | Grad Max: 0.000490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000482 | Grad Max: 0.001126 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001452 | Grad Max: 0.001452 [GRADIENT NORM TOTAL] 1.8972 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.645 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5004171 0.4995829] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.050 [MASKS] A(Pass/Fail): 671/1377 | B: 575/1473 | C: 286/1762 [LOSS Ex1] A: 0.65907 | B: 0.64892 | C: 0.64817 [LOGITS Ex2 A] Mean Abs: 1.777 | Max: 5.779 [LOSS Ex2] A: 0.15438 | B: 0.36510 | C: 0.29718 ** [JOINT LOSS] ** : 0.924274 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005088 | Grad Max: 0.148960 -> Layer: shared_layers.0.bias | Grad Mean: 0.208430 | Grad Max: 0.969832 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.006052 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004571 | Grad Max: 0.004571 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001434 | Grad Max: 0.235267 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026066 | Grad Max: 1.312161 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.007037 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012094 | Grad Max: 0.056557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000465 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002757 | Grad Max: 0.006749 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000190 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.001972 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001010 | Grad Max: 0.002513 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018225 | Grad Max: 0.018225 [GRADIENT NORM TOTAL] 4.3957 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.427 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6430642 0.35693574] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 654/1394 | B: 528/1328 | C: 310/1738 [LOSS Ex1] A: 0.65596 | B: 0.65259 | C: 0.64428 [LOGITS Ex2 A] Mean Abs: 1.794 | Max: 6.331 [LOSS Ex2] A: 0.17281 | B: 0.35883 | C: 0.27963 ** [JOINT LOSS] ** : 0.921369 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.004037 | Grad Max: 0.130079 -> Layer: shared_layers.0.bias | Grad Mean: 0.076718 | Grad Max: 0.376148 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.006346 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000514 | Grad Max: 0.000514 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000649 | Grad Max: 0.081457 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010855 | Grad Max: 0.423184 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003305 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003916 | Grad Max: 0.023704 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000249 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000925 | Grad Max: 0.003039 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000257 | Grad Max: 0.000851 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001651 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005073 | Grad Max: 0.005073 [GRADIENT NORM TOTAL] 1.7215 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.503 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.590244 0.40975603] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.051 [MASKS] A(Pass/Fail): 541/1075 | B: 561/1487 | C: 319/1729 [LOSS Ex1] A: 0.65450 | B: 0.65176 | C: 0.64188 [LOGITS Ex2 A] Mean Abs: 1.878 | Max: 6.110 [LOSS Ex2] A: 0.15217 | B: 0.38676 | C: 0.28578 ** [JOINT LOSS] ** : 0.924281 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004083 | Grad Max: 0.121803 -> Layer: shared_layers.0.bias | Grad Mean: 0.325477 | Grad Max: 1.489616 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006966 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006330 | Grad Max: 0.006330 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.199119 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.038172 | Grad Max: 1.100857 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.009705 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018438 | Grad Max: 0.078121 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000499 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004049 | Grad Max: 0.008848 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001168 | Grad Max: 0.002572 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001226 | Grad Max: 0.002748 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025054 | Grad Max: 0.025054 [GRADIENT NORM TOTAL] 6.2687 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.646 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069767 0.49302328] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 677/1371 | B: 570/1478 | C: 285/1763 [LOSS Ex1] A: 0.65490 | B: 0.65250 | C: 0.64782 [LOGITS Ex2 A] Mean Abs: 1.850 | Max: 6.611 [LOSS Ex2] A: 0.16891 | B: 0.39186 | C: 0.28289 ** [JOINT LOSS] ** : 0.932960 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006082 | Grad Max: 0.186873 -> Layer: shared_layers.0.bias | Grad Mean: 0.382505 | Grad Max: 1.673456 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006709 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006682 | Grad Max: 0.006682 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002528 | Grad Max: 0.270969 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046557 | Grad Max: 1.453599 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000428 | Grad Max: 0.012188 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022943 | Grad Max: 0.100090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000630 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005095 | Grad Max: 0.010955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | 
Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001472 | Grad Max: 0.003715 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001636 | Grad Max: 0.003456 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031935 | Grad Max: 0.031935 [GRADIENT NORM TOTAL] 7.5072 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.598 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102018 0.4897982] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 665/1383 | B: 575/1473 | C: 278/1770 [LOSS Ex1] A: 0.65265 | B: 0.64874 | C: 0.64832 [LOGITS Ex2 A] Mean Abs: 1.822 | Max: 5.656 [LOSS Ex2] A: 0.16123 | B: 0.36714 | C: 0.29864 ** [JOINT LOSS] ** : 0.925572 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004957 | Grad Max: 0.180381 -> Layer: shared_layers.0.bias | Grad Mean: 0.077351 | Grad Max: 0.366577 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006851 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004600 | Grad Max: 0.004600 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000708 | Grad Max: 0.097931 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010861 | Grad Max: 0.542438 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003288 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002936 | Grad Max: 0.021758 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000212 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000779 | Grad Max: 0.002827 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000236 | Grad Max: 0.000754 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000406 | Grad Max: 0.001332 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005600 | Grad Max: 0.005600 [GRADIENT NORM TOTAL] 1.8627 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.617 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50247395 0.49752602] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 657/1391 | B: 530/1326 | C: 290/1758 [LOSS Ex1] A: 0.65164 | B: 0.65240 | C: 0.64786 [LOGITS Ex2 A] Mean Abs: 1.763 | Max: 6.968 [LOSS Ex2] A: 0.18579 | B: 0.38944 | C: 0.29523 ** [JOINT LOSS] ** : 0.940790 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005287 | Grad Max: 0.158571 -> Layer: shared_layers.0.bias | Grad Mean: 0.492332 | Grad Max: 2.257746 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.007609 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007978 | Grad Max: 0.007978 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003027 | Grad Max: 0.337989 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056935 | Grad Max: 1.897598 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000519 | Grad Max: 0.017684 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028103 | Grad Max: 0.148489 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000730 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006093 | Grad Max: 0.012983 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000369 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001740 | Grad Max: 0.004411 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001915 | Grad Max: 0.003449 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037953 | Grad Max: 0.037953 [GRADIENT NORM TOTAL] 9.6642 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.513 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504084 0.495916] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 637/1411 | B: 562/1486 | C: 304/1744 [LOSS Ex1] A: 0.65834 | B: 0.65157 | C: 0.64317 [LOGITS Ex2 A] Mean Abs: 1.755 | Max: 6.152 [LOSS Ex2] A: 0.16453 | B: 0.41998 | C: 0.30292 
** [JOINT LOSS] ** : 0.946832 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009461 | Grad Max: 0.215220 -> Layer: shared_layers.0.bias | Grad Mean: 0.620826 | Grad Max: 2.662034 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.006122 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005347 | Grad Max: 0.005347 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004053 | Grad Max: 0.421919 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076124 | Grad Max: 2.323021 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000708 | Grad Max: 0.020869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038181 | Grad Max: 0.178900 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001029 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008384 | Grad Max: 0.016848 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000477 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002410 | Grad Max: 0.005349 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002842 | Grad Max: 0.005225 -> Layer: exit2_layers.12.bias | Grad Mean: 0.053375 | Grad Max: 0.053375 [GRADIENT NORM TOTAL] 12.1599 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.443 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5366132 0.46338683] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 629/1419 | B: 570/1478 | C: 176/1200 [LOSS Ex1] A: 0.65811 | B: 0.65231 | C: 0.64940 [LOGITS Ex2 A] Mean Abs: 1.744 | Max: 6.051 [LOSS Ex2] A: 0.16804 | B: 0.39112 | C: 0.27477 ** [JOINT LOSS] ** : 0.931255 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005803 | Grad Max: 0.148400 -> Layer: shared_layers.0.bias | Grad Mean: 0.312755 | Grad Max: 1.324983 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.005904 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000446 | Grad Max: 0.000446 -> Layer: exit2_layers.0.weight | 
Grad Mean: 0.002080 | Grad Max: 0.241350 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038718 | Grad Max: 1.353927 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.010835 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019171 | Grad Max: 0.087482 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000547 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004273 | Grad Max: 0.009307 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001233 | Grad Max: 0.003224 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001470 | Grad Max: 0.003140 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027220 | Grad Max: 0.027220 [GRADIENT NORM TOTAL] 6.0220 [EPOCH SUMMARY] Train Loss: 0.9308 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9156 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 92/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.579 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67777485 0.32222512] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 693/1355 | B: 574/1474 | C: 286/1762 [LOSS Ex1] A: 0.65349 | B: 0.64854 | C: 0.64865 [LOGITS Ex2 A] Mean Abs: 1.859 | Max: 6.006 [LOSS Ex2] A: 0.15514 | B: 0.36985 | C: 0.28355 ** [JOINT LOSS] ** : 0.919742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003667 | Grad Max: 0.128812 -> Layer: shared_layers.0.bias | Grad Mean: 0.339979 | Grad Max: 1.608904 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002284 | Grad Max: 0.006945 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009336 | Grad Max: 0.009336 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.239315 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040262 | Grad Max: 1.332754 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000374 | Grad Max: 0.014221 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020307 | Grad Max: 0.117879 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000615 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004426 | Grad Max: 0.009548 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000254 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001271 | Grad Max: 0.002977 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.003539 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028453 | Grad Max: 0.028453 [GRADIENT NORM TOTAL] 6.9628 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.649 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003976 0.49960235] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 673/1375 | B: 530/1326 | C: 272/1776 [LOSS Ex1] A: 0.65880 | B: 0.65222 | C: 0.64765 [LOGITS Ex2 A] Mean Abs: 1.893 | Max: 5.572 [LOSS Ex2] A: 0.15918 | B: 0.37306 | C: 0.31210 ** [JOINT LOSS] ** : 0.934336 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005082 | Grad Max: 0.191642 -> Layer: shared_layers.0.bias | Grad Mean: 0.561436 | Grad Max: 2.533338 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.006198 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007091 | Grad Max: 0.007091 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003437 | Grad Max: 0.385297 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064882 | Grad Max: 2.161893 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000606 | Grad Max: 0.022189 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032980 | Grad Max: 0.187524 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000888 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007061 | Grad Max: 0.014494 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000385 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002030 | Grad Max: 0.004572 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002262 | Grad Max: 0.004205 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044619 | Grad Max: 0.044619 [GRADIENT NORM TOTAL] 11.2940 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.432 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64423674 0.35576323] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 654/1394 | B: 561/1487 | C: 290/1758 [LOSS Ex1] A: 0.65567 | B: 0.65140 | C: 0.64852 [LOGITS Ex2 A] Mean Abs: 1.876 | Max: 6.147 [LOSS Ex2] A: 0.17275 | B: 0.39255 | C: 0.29798 ** [JOINT LOSS] ** : 0.939618 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004016 | Grad Max: 0.128612 -> Layer: shared_layers.0.bias | Grad Mean: 0.381852 | Grad Max: 1.698839 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.006175 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005037 | Grad Max: 0.005037 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002358 | Grad Max: 0.281142 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043765 | Grad Max: 1.550687 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.015715 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022288 | Grad Max: 0.129662 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000553 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004754 | Grad Max: 0.009547 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001371 | Grad Max: 0.003123 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001548 | Grad Max: 0.003109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030472 | Grad Max: 0.030472 [GRADIENT NORM TOTAL] 7.7432 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.098 | Max: 0.507 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59096104 0.409039 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 542/1074 | B: 568/1480 | C: 289/1759 [LOSS Ex1] A: 0.65422 | B: 0.65215 | C: 0.64743 [LOGITS Ex2 A] Mean Abs: 1.884 | Max: 5.802 [LOSS Ex2] A: 0.16582 | B: 0.38353 | C: 0.30158 ** [JOINT LOSS] ** : 0.934910 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004277 | Grad Max: 0.153335 -> Layer: shared_layers.0.bias | Grad Mean: 0.069341 | Grad Max: 0.386718 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.006357 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004784 | Grad Max: 0.004784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000710 | Grad Max: 0.163362 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011447 | Grad Max: 0.919861 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.003606 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003537 | Grad Max: 0.026139 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000184 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000886 | Grad Max: 0.002659 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000284 | Grad Max: 0.000851 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.001459 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008038 | Grad Max: 0.008038 [GRADIENT NORM TOTAL] 2.0138 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.651 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5069954 0.49300456] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 678/1370 | B: 574/1474 | C: 285/1763 [LOSS Ex1] A: 0.65462 | B: 0.64838 | C: 0.64502 [LOGITS Ex2 A] Mean Abs: 1.845 | Max: 5.966 [LOSS Ex2] A: 0.15724 | B: 0.37078 | C: 0.29139 ** [JOINT LOSS] ** : 0.922477 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005197 | Grad Max: 0.154039 -> Layer: shared_layers.0.bias | Grad Mean: 0.281595 | Grad Max: 1.280005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006672 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002254 | Grad Max: 0.002254 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001869 | Grad Max: 0.270828 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034429 | Grad Max: 1.520162 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000317 | Grad Max: 0.009581 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017041 | Grad Max: 0.085095 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000501 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003780 | Grad Max: 0.008595 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000243 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001088 | Grad Max: 0.002437 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001298 | Grad Max: 0.002644 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024479 | Grad Max: 0.024479 [GRADIENT NORM TOTAL] 5.6071 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.603 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102565 0.48974347] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 665/1383 | B: 530/1326 | C: 288/1760 [LOSS Ex1] A: 0.65236 | B: 0.65207 | C: 0.64421 [LOGITS Ex2 A] Mean Abs: 1.846 | Max: 6.658 [LOSS Ex2] A: 0.15899 | B: 0.35861 | C: 0.27957 ** [JOINT LOSS] ** : 0.915270 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002564 | Grad Max: 0.072754 -> Layer: shared_layers.0.bias | Grad Mean: 0.124360 | Grad Max: 0.646574 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.007165 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007611 | Grad Max: 0.007611 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000850 | Grad Max: 
0.278419 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014817 | Grad Max: 1.565183 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004689 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004523 | Grad Max: 0.033185 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000872 | Grad Max: 0.003131 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000239 | Grad Max: 0.000838 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001223 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004972 | Grad Max: 0.004972 [GRADIENT NORM TOTAL] 3.0980 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.622 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024591 0.49754086] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 658/1390 | B: 561/1487 | C: 299/1749 [LOSS Ex1] A: 0.65136 | B: 0.65124 | C: 0.64648 [LOGITS Ex2 A] Mean Abs: 1.865 | Max: 7.097 [LOSS Ex2] A: 0.18188 | B: 0.38529 | C: 0.28976 ** [JOINT LOSS] ** : 0.935338 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007570 | Grad Max: 0.294624 -> Layer: shared_layers.0.bias | Grad Mean: 0.268254 | Grad Max: 1.196448 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006576 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002533 | Grad Max: 0.002533 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001990 | Grad Max: 0.188115 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035470 | Grad Max: 0.967117 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000328 | Grad Max: 0.009353 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017001 | Grad Max: 0.073239 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000538 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003868 | Grad Max: 0.008324 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000279 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001105 | Grad Max: 0.002830 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001226 | Grad Max: 0.002574 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022825 | Grad Max: 0.022825 [GRADIENT NORM TOTAL] 5.1843 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.517 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040335 0.4959665] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.541 | Std: 0.051 [MASKS] A(Pass/Fail): 639/1409 | B: 568/1480 | C: 275/1773 [LOSS Ex1] A: 0.65809 | B: 0.65199 | C: 0.64710 [LOGITS Ex2 A] Mean Abs: 1.830 | Max: 6.043 [LOSS Ex2] A: 0.16440 | B: 0.38289 | C: 0.27157 ** [JOINT LOSS] ** : 0.925346 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004814 | Grad Max: 0.142152 -> Layer: shared_layers.0.bias | Grad Mean: 0.165591 | Grad Max: 0.753873 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.005676 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005020 | Grad Max: 0.005020 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001202 | Grad Max: 0.180513 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021584 | Grad Max: 0.934202 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.004630 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009056 | Grad Max: 0.041225 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002073 | Grad Max: 0.004968 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000598 | Grad Max: 0.001504 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000691 | Grad Max: 0.001983 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012819 | Grad Max: 0.012819 [GRADIENT NORM TOTAL] 3.5458 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.092 | Max: 0.447 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5368057 0.46319425] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.049 [MASKS] A(Pass/Fail): 629/1419 | B: 574/1474 | C: 316/1732 [LOSS Ex1] A: 0.65787 | B: 0.64821 | C: 0.64249 [LOGITS Ex2 A] Mean Abs: 1.767 | Max: 6.467 [LOSS Ex2] A: 0.16605 | B: 0.36560 | C: 0.27598 ** [JOINT LOSS] ** : 0.918730 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002956 | Grad Max: 0.066338 -> Layer: shared_layers.0.bias | Grad Mean: 0.211834 | Grad Max: 0.774864 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006985 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008664 | Grad Max: 0.008664 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001353 | Grad Max: 0.228826 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025063 | Grad Max: 1.297519 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.007038 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012679 | Grad Max: 0.064054 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000397 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002790 | Grad Max: 0.006172 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000179 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002052 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000912 | Grad Max: 0.002290 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017517 | Grad Max: 0.017517 [GRADIENT NORM TOTAL] 4.2809 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.584 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.679265 0.32073498] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.052 [MASKS] A(Pass/Fail): 694/1354 | B: 530/1326 | C: 293/1755 [LOSS Ex1] A: 0.65322 | B: 0.65191 | C: 0.64935 [LOGITS Ex2 A] Mean Abs: 
1.821 | Max: 6.047 [LOSS Ex2] A: 0.15875 | B: 0.37029 | C: 0.31864 ** [JOINT LOSS] ** : 0.934051 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002368 | Grad Max: 0.078193 -> Layer: shared_layers.0.bias | Grad Mean: 0.253054 | Grad Max: 1.076362 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.006888 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007903 | Grad Max: 0.007903 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001464 | Grad Max: 0.230275 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027170 | Grad Max: 1.291071 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.009161 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013900 | Grad Max: 0.067334 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000418 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003021 | Grad Max: 0.006504 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000242 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002275 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000981 | Grad Max: 0.002050 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019146 | Grad Max: 0.019146 [GRADIENT NORM TOTAL] 4.8876 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.655 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040823 0.49959177] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.051 [MASKS] A(Pass/Fail): 677/1371 | B: 564/1484 | C: 272/1776 [LOSS Ex1] A: 0.65855 | B: 0.65108 | C: 0.64695 [LOGITS Ex2 A] Mean Abs: 1.851 | Max: 5.614 [LOSS Ex2] A: 0.15595 | B: 0.38089 | C: 0.27419 ** [JOINT LOSS] ** : 0.922538 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002971 | Grad Max: 0.099688 -> Layer: shared_layers.0.bias | Grad Mean: 0.119334 | Grad Max: 0.558446 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006445 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.006482 | Grad Max: 0.006482 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000883 | Grad Max: 0.098125 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015048 | Grad Max: 0.537029 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.006388 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005963 | Grad Max: 0.041264 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000276 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001152 | Grad Max: 0.003678 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000327 | Grad Max: 0.000922 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000464 | Grad Max: 0.001588 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006809 | Grad Max: 0.006809 [GRADIENT NORM TOTAL] 2.5326 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.436 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64538574 0.3546142 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 658/1390 | B: 568/1480 | C: 290/1758 [LOSS Ex1] A: 0.65539 | B: 0.65183 | C: 0.64849 [LOGITS Ex2 A] Mean Abs: 1.838 | Max: 5.875 [LOSS Ex2] A: 0.17367 | B: 0.37886 | C: 0.30794 ** [JOINT LOSS] ** : 0.938729 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.068216 -> Layer: shared_layers.0.bias | Grad Mean: 0.157219 | Grad Max: 0.888710 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.006228 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001132 | Grad Max: 0.001132 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001063 | Grad Max: 0.160097 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018832 | Grad Max: 0.895306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.005882 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007734 | Grad Max: 0.045255 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | 
Grad Max: 0.000247 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001598 | Grad Max: 0.004417 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000110 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000446 | Grad Max: 0.001160 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001335 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008590 | Grad Max: 0.008590 [GRADIENT NORM TOTAL] 3.4145 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.511 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5916473 0.4083527] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 544/1072 | B: 574/1474 | C: 292/1756 [LOSS Ex1] A: 0.65393 | B: 0.64803 | C: 0.64540 [LOGITS Ex2 A] Mean Abs: 1.863 | Max: 5.853 [LOSS Ex2] A: 0.15106 | B: 0.36794 | C: 0.28996 ** [JOINT LOSS] ** : 0.918773 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004178 | Grad Max: 0.114518 -> Layer: shared_layers.0.bias | Grad Mean: 0.169060 | Grad Max: 0.893915 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006544 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005485 | Grad Max: 0.005485 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001213 | Grad Max: 0.192588 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022316 | Grad Max: 1.094017 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.006237 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010760 | Grad Max: 0.048084 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000353 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002395 | Grad Max: 0.005685 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.001709 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000867 | Grad Max: 0.001996 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015484 | Grad Max: 
0.015484 [GRADIENT NORM TOTAL] 3.5422 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.656 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50704396 0.4929561 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 678/1370 | B: 530/1326 | C: 221/1155 [LOSS Ex1] A: 0.65434 | B: 0.65174 | C: 0.63927 [LOGITS Ex2 A] Mean Abs: 1.838 | Max: 6.722 [LOSS Ex2] A: 0.16645 | B: 0.35847 | C: 0.28398 ** [JOINT LOSS] ** : 0.918083 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001895 | Grad Max: 0.069143 -> Layer: shared_layers.0.bias | Grad Mean: 0.053644 | Grad Max: 0.232167 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006422 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003359 | Grad Max: 0.003359 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000482 | Grad Max: 0.105066 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007950 | Grad Max: 0.584918 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.002901 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002437 | Grad Max: 0.023816 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000222 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000530 | Grad Max: 0.002255 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000160 | Grad Max: 0.000595 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001369 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003989 | Grad Max: 0.003989 [GRADIENT NORM TOTAL] 1.4640 [EPOCH SUMMARY] Train Loss: 0.9270 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9066 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9104 -> New: 0.9066) ############################## EPOCH 93/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.608 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102843 0.48971567] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 665/1383 | B: 566/1482 | C: 313/1735 [LOSS Ex1] A: 0.65205 | B: 0.65090 | C: 0.64309 [LOGITS Ex2 A] Mean Abs: 1.843 | Max: 6.004 [LOSS Ex2] A: 0.15882 | B: 0.38725 | C: 0.27441 ** [JOINT LOSS] ** : 0.922175 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003158 | Grad Max: 0.103675 -> Layer: shared_layers.0.bias | Grad Mean: 0.079199 | Grad Max: 0.327341 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006802 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003918 | Grad Max: 0.003918 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000581 | Grad Max: 0.167401 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009151 | Grad Max: 0.900169 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003589 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001999 | Grad Max: 0.018306 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000163 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000322 | Grad Max: 0.001739 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000553 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000283 | Grad Max: 0.000940 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001749 | Grad Max: 0.001749 [GRADIENT NORM TOTAL] 2.0753 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.627 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50248706 0.497513 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 661/1387 | B: 568/1480 | C: 298/1750 [LOSS Ex1] A: 0.65104 | B: 0.65164 | C: 0.64534 [LOGITS Ex2 A] Mean Abs: 1.847 | Max: 6.486 [LOSS Ex2] A: 0.17422 | B: 0.38445 | C: 0.28379 ** [JOINT LOSS] ** : 0.930159 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003805 | Grad Max: 0.176372 -> Layer: shared_layers.0.bias | Grad Mean: 0.086100 | Grad Max: 0.330938 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.007153 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004991 | Grad Max: 0.004991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000788 | Grad Max: 0.277500 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012200 | Grad Max: 1.553403 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003341 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003173 | Grad Max: 0.020364 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000844 | Grad Max: 0.002746 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000257 | Grad Max: 0.000822 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.001186 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005605 | Grad Max: 0.005605 [GRADIENT NORM TOTAL] 2.6806 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.521 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039947 0.49600533] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 643/1405 | B: 575/1473 | C: 338/1710 [LOSS Ex1] A: 0.65781 | B: 0.64783 | C: 0.64287 [LOGITS Ex2 A] Mean Abs: 1.812 | Max: 7.042 [LOSS Ex2] A: 0.15903 | B: 0.36646 | C: 0.28254 ** [JOINT LOSS] ** : 0.918850 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.050671 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.068912 | Grad Max: 0.328047 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006131 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003249 | Grad Max: 0.003249 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000542 | Grad Max: 0.206713 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009309 | Grad Max: 1.158949 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003000 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002546 | Grad Max: 0.022085 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000146 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000507 | Grad Max: 0.002246 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000648 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001146 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002982 | Grad Max: 0.002982 [GRADIENT NORM TOTAL] 2.1317 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.093 | Max: 0.451 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53708684 0.46291316] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.541 | Std: 0.050 [MASKS] A(Pass/Fail): 634/1414 | B: 530/1326 | C: 309/1739 [LOSS Ex1] A: 0.65758 | B: 0.65153 | C: 0.64526 [LOGITS Ex2 A] Mean Abs: 1.820 | Max: 6.467 [LOSS Ex2] A: 0.17223 | B: 0.36018 | C: 0.29376 ** [JOINT LOSS] ** : 0.926847 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003066 | Grad Max: 0.099813 -> Layer: shared_layers.0.bias | Grad Mean: 0.076162 | Grad Max: 0.292323 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.006115 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003572 | Grad Max: 0.003572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000628 | Grad Max: 0.102752 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010110 | Grad Max: 0.566351 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000057 | Grad Max: 0.004246 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002133 | Grad Max: 0.021278 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000119 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000307 | Grad Max: 0.001841 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000054 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000086 | Grad Max: 0.000566 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000217 | Grad Max: 0.000945 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001604 | Grad Max: 0.001604 [GRADIENT NORM TOTAL] 1.8266 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.588 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6811671 0.31883287] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.053 [MASKS] A(Pass/Fail): 695/1353 | B: 568/1480 | C: 297/1751 [LOSS Ex1] A: 0.65288 | B: 0.65068 | C: 0.64673 [LOGITS Ex2 A] Mean Abs: 1.861 | Max: 5.963 [LOSS Ex2] A: 0.16112 | B: 0.39143 | C: 0.29886 ** [JOINT LOSS] ** : 0.933894 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.040965 -> Layer: shared_layers.0.bias | Grad Mean: 0.050697 | Grad Max: 0.203648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.007236 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010883 | Grad Max: 0.010883 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000451 | Grad Max: 0.103377 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007579 | Grad Max: 0.580788 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002616 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001428 | Grad Max: 0.013343 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000186 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000249 | Grad Max: 0.001585 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000066 | Grad 
Max: 0.000347 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000161 | Grad Max: 0.000727 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000005 | Grad Max: 0.000005 [GRADIENT NORM TOTAL] 1.3744 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.660 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005068 0.4994932] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 685/1363 | B: 573/1475 | C: 299/1749 [LOSS Ex1] A: 0.65823 | B: 0.65141 | C: 0.64607 [LOGITS Ex2 A] Mean Abs: 1.845 | Max: 5.734 [LOSS Ex2] A: 0.15611 | B: 0.38364 | C: 0.29217 ** [JOINT LOSS] ** : 0.929206 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003615 | Grad Max: 0.151979 -> Layer: shared_layers.0.bias | Grad Mean: 0.072190 | Grad Max: 0.334765 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.005404 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001131 | Grad Max: 0.001131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000680 | Grad Max: 0.069922 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011271 | Grad Max: 0.385466 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004539 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004500 | Grad Max: 0.030901 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001108 | Grad Max: 0.003268 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000333 | Grad Max: 0.000972 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000492 | Grad Max: 0.001358 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008522 | Grad Max: 0.008522 [GRADIENT NORM TOTAL] 1.7060 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.441 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.6470224 0.35297763] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 668/1380 | B: 576/1472 | C: 279/1769 [LOSS Ex1] A: 0.65504 | B: 0.64757 | C: 0.64852 [LOGITS Ex2 A] Mean Abs: 1.855 | Max: 5.738 [LOSS Ex2] A: 0.17752 | B: 0.36145 | C: 0.27253 ** [JOINT LOSS] ** : 0.920875 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004083 | Grad Max: 0.158371 -> Layer: shared_layers.0.bias | Grad Mean: 0.173027 | Grad Max: 0.697464 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.005970 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004082 | Grad Max: 0.004082 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001232 | Grad Max: 0.153926 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021975 | Grad Max: 0.860011 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.006756 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009860 | Grad Max: 0.054284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000312 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002210 | Grad Max: 0.004962 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000626 | Grad Max: 0.001524 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000727 | Grad Max: 0.002163 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014285 | Grad Max: 0.014285 [GRADIENT NORM TOTAL] 3.6321 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.518 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59265697 0.40734306] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.053 [MASKS] A(Pass/Fail): 554/1062 | B: 531/1325 | C: 282/1766 [LOSS Ex1] A: 0.65355 | B: 0.65127 | C: 0.64774 [LOGITS Ex2 A] Mean Abs: 1.911 | Max: 6.442 [LOSS Ex2] A: 0.15466 | B: 0.36337 | C: 0.26729 ** [JOINT LOSS] ** : 0.912627 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.003322 | Grad Max: 0.109196 -> Layer: shared_layers.0.bias | Grad Mean: 0.077544 | Grad Max: 0.287559 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.006465 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000565 | Grad Max: 0.000565 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000654 | Grad Max: 0.101660 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011216 | Grad Max: 0.560014 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.002733 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003481 | Grad Max: 0.020482 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000812 | Grad Max: 0.002764 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000233 | Grad Max: 0.000794 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000422 | Grad Max: 0.001453 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004993 | Grad Max: 0.004993 [GRADIENT NORM TOTAL] 1.7912 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.662 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50695986 0.49304014] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 681/1367 | B: 568/1480 | C: 301/1747 [LOSS Ex1] A: 0.65394 | B: 0.65040 | C: 0.64618 [LOGITS Ex2 A] Mean Abs: 1.852 | Max: 5.836 [LOSS Ex2] A: 0.16092 | B: 0.40089 | C: 0.31621 ** [JOINT LOSS] ** : 0.942846 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002926 | Grad Max: 0.087344 -> Layer: shared_layers.0.bias | Grad Mean: 0.213648 | Grad Max: 1.023857 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006235 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001747 | Grad Max: 0.001747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001467 | Grad Max: 0.242605 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.026443 | Grad Max: 1.355647 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.008899 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012123 | Grad Max: 0.068545 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000345 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002509 | Grad Max: 0.005537 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000727 | Grad Max: 0.001805 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000901 | Grad Max: 0.001806 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017392 | Grad Max: 0.017392 [GRADIENT NORM TOTAL] 4.6766 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.615 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104789 0.48952106] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 668/1380 | B: 573/1475 | C: 319/1729 [LOSS Ex1] A: 0.65160 | B: 0.65113 | C: 0.64264 [LOGITS Ex2 A] Mean Abs: 1.867 | Max: 6.049 [LOSS Ex2] A: 0.15968 | B: 0.38782 | C: 0.29235 ** [JOINT LOSS] ** : 0.928407 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002776 | Grad Max: 0.084195 -> Layer: shared_layers.0.bias | Grad Mean: 0.215889 | Grad Max: 0.993085 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006540 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000347 | Grad Max: 0.000347 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001464 | Grad Max: 0.212727 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027338 | Grad Max: 1.195763 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000246 | Grad Max: 0.009312 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013381 | Grad Max: 0.079547 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000361 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002871 | Grad Max: 0.006406 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000831 | Grad Max: 0.002068 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001006 | Grad Max: 0.001906 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019339 | Grad Max: 0.019339 [GRADIENT NORM TOTAL] 4.6895 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.635 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024326 0.49756747] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 668/1380 | B: 577/1471 | C: 297/1751 [LOSS Ex1] A: 0.65058 | B: 0.64728 | C: 0.64591 [LOGITS Ex2 A] Mean Abs: 1.874 | Max: 6.684 [LOSS Ex2] A: 0.18199 | B: 0.35464 | C: 0.30848 ** [JOINT LOSS] ** : 0.929626 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005041 | Grad Max: 0.156248 -> Layer: shared_layers.0.bias | Grad Mean: 0.269874 | Grad Max: 0.924642 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002327 | Grad Max: 0.007141 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001807 | Grad Max: 0.001807 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001759 | Grad Max: 0.191732 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031743 | Grad Max: 1.083912 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.009703 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015406 | Grad Max: 0.080803 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000456 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003423 | Grad Max: 0.008102 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.002358 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001003 | Grad Max: 0.002218 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019616 | Grad Max: 0.019616 [GRADIENT NORM TOTAL] 5.1261 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.527 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50386965 0.49613038] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.542 | Std: 0.052 [MASKS] A(Pass/Fail): 647/1401 | B: 532/1324 | C: 309/1739 [LOSS Ex1] A: 0.65743 | B: 0.65100 | C: 0.64718 [LOGITS Ex2 A] Mean Abs: 1.850 | Max: 5.752 [LOSS Ex2] A: 0.15278 | B: 0.36034 | C: 0.26409 ** [JOINT LOSS] ** : 0.910937 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002499 | Grad Max: 0.066421 -> Layer: shared_layers.0.bias | Grad Mean: 0.175316 | Grad Max: 0.692659 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005339 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002435 | Grad Max: 0.002435 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.202857 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021048 | Grad Max: 1.134658 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.006109 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009008 | Grad Max: 0.046128 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000345 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001941 | Grad Max: 0.004658 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000123 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000561 | Grad Max: 0.001473 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000628 | Grad Max: 0.002384 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012907 | Grad Max: 0.012907 [GRADIENT NORM TOTAL] 3.7130 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.094 | Max: 0.458 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53754354 0.4624564 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 640/1408 | B: 570/1478 | C: 313/1735 [LOSS Ex1] A: 0.65720 | B: 0.65014 | C: 0.64544 [LOGITS Ex2 A] Mean Abs: 1.802 | Max: 
6.527 [LOSS Ex2] A: 0.16202 | B: 0.39034 | C: 0.27333 ** [JOINT LOSS] ** : 0.926156 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004074 | Grad Max: 0.111585 -> Layer: shared_layers.0.bias | Grad Mean: 0.201297 | Grad Max: 0.827602 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.006180 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005935 | Grad Max: 0.005935 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001321 | Grad Max: 0.226707 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024192 | Grad Max: 1.279555 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.006656 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011379 | Grad Max: 0.053713 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000407 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002502 | Grad Max: 0.005803 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000159 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000714 | Grad Max: 0.001814 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000862 | Grad Max: 0.002005 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015952 | Grad Max: 0.015952 [GRADIENT NORM TOTAL] 4.1988 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.594 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6836789 0.3163211] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.053 [MASKS] A(Pass/Fail): 697/1351 | B: 573/1475 | C: 221/1155 [LOSS Ex1] A: 0.65244 | B: 0.65088 | C: 0.64490 [LOGITS Ex2 A] Mean Abs: 1.870 | Max: 6.060 [LOSS Ex2] A: 0.14830 | B: 0.37986 | C: 0.29265 ** [JOINT LOSS] ** : 0.923013 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003143 | Grad Max: 0.081301 -> Layer: shared_layers.0.bias | Grad Mean: 0.094740 | Grad Max: 0.370645 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006419 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001831 | Grad 
Max: 0.001831 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000713 | Grad Max: 0.127491 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012843 | Grad Max: 0.697899 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000110 | Grad Max: 0.005871 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005859 | Grad Max: 0.042330 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000319 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001299 | Grad Max: 0.004187 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000374 | Grad Max: 0.001105 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000495 | Grad Max: 0.001610 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008682 | Grad Max: 0.008682 [GRADIENT NORM TOTAL] 2.0956 [EPOCH SUMMARY] Train Loss: 0.9254 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9099 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 94/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.666 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50059867 0.49940136] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.052 [MASKS] A(Pass/Fail): 686/1362 | B: 577/1471 | C: 325/1723 [LOSS Ex1] A: 0.65786 | B: 0.64703 | C: 0.64720 [LOGITS Ex2 A] Mean Abs: 1.897 | Max: 5.824 [LOSS Ex2] A: 0.15172 | B: 0.37418 | C: 0.31330 ** [JOINT LOSS] ** : 0.930429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003410 | Grad Max: 0.167050 -> Layer: shared_layers.0.bias | Grad Mean: 0.393950 | Grad Max: 2.036755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.006072 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007418 | Grad Max: 0.007418 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002502 | Grad Max: 0.333904 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.046451 | Grad Max: 1.866511 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000422 | Grad Max: 0.016837 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023402 | Grad Max: 0.130004 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000621 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004909 | Grad Max: 0.011024 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001383 | Grad Max: 0.003242 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001446 | Grad Max: 0.002972 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030060 | Grad Max: 0.030060 [GRADIENT NORM TOTAL] 8.2414 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.447 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64879954 0.35120043] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 669/1379 | B: 532/1324 | C: 319/1729 [LOSS Ex1] A: 0.65464 | B: 0.65077 | C: 0.64446 [LOGITS Ex2 A] Mean Abs: 1.890 | Max: 6.228 [LOSS Ex2] A: 0.17786 | B: 0.35962 | C: 0.27432 ** [JOINT LOSS] ** : 0.920552 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003436 | Grad Max: 0.089997 -> Layer: shared_layers.0.bias | Grad Mean: 0.259491 | Grad Max: 1.116624 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.006681 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003235 | Grad Max: 0.003235 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001770 | Grad Max: 0.257933 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032716 | Grad Max: 1.437705 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.010635 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015478 | Grad Max: 0.094235 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000416 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003382 | Grad Max: 0.007164 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000016 | Grad Max: 0.000176 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000971 | Grad Max: 0.002216 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001094 | Grad Max: 0.002542 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021927 | Grad Max: 0.021927 [GRADIENT NORM TOTAL] 5.6039 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.524 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5937136 0.4062864] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.053 [MASKS] A(Pass/Fail): 558/1058 | B: 570/1478 | C: 353/1695 [LOSS Ex1] A: 0.65315 | B: 0.64990 | C: 0.64035 [LOGITS Ex2 A] Mean Abs: 1.920 | Max: 5.690 [LOSS Ex2] A: 0.15676 | B: 0.38217 | C: 0.27585 ** [JOINT LOSS] ** : 0.919392 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001834 | Grad Max: 0.054574 -> Layer: shared_layers.0.bias | Grad Mean: 0.110690 | Grad Max: 0.508863 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.006937 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005334 | Grad Max: 0.005334 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.177208 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013670 | Grad Max: 0.996782 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000099 | Grad Max: 0.004160 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005135 | Grad Max: 0.030032 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000261 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001095 | Grad Max: 0.003512 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000312 | Grad Max: 0.000905 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000411 | Grad Max: 0.001270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007238 | Grad Max: 0.007238 [GRADIENT NORM TOTAL] 2.5197 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.668 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506917 0.493083] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 683/1365 | B: 573/1475 | C: 314/1734 [LOSS Ex1] A: 0.65355 | B: 0.65065 | C: 0.64590 [LOGITS Ex2 A] Mean Abs: 1.879 | Max: 6.400 [LOSS Ex2] A: 0.15356 | B: 0.38044 | C: 0.28284 ** [JOINT LOSS] ** : 0.922319 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.070122 -> Layer: shared_layers.0.bias | Grad Mean: 0.128331 | Grad Max: 0.617764 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006336 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004969 | Grad Max: 0.004969 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000793 | Grad Max: 0.337195 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014149 | Grad Max: 1.891307 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.003927 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005312 | Grad Max: 0.027226 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001102 | Grad Max: 0.003435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.001132 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000404 | Grad Max: 0.001517 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006167 | Grad Max: 0.006167 [GRADIENT NORM TOTAL] 3.3374 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.622 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105466 0.48945338] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.053 [MASKS] A(Pass/Fail): 669/1379 | B: 578/1470 | C: 323/1725 [LOSS Ex1] A: 0.65120 | B: 0.64679 | C: 0.64488 [LOGITS Ex2 A] Mean Abs: 1.902 | Max: 6.255 [LOSS Ex2] A: 0.15565 | B: 0.36219 | C: 
0.28684 ** [JOINT LOSS] ** : 0.915851 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003191 | Grad Max: 0.078905 -> Layer: shared_layers.0.bias | Grad Mean: 0.103836 | Grad Max: 0.524116 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002281 | Grad Max: 0.006705 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003866 | Grad Max: 0.003866 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000796 | Grad Max: 0.128161 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014482 | Grad Max: 0.713685 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.004999 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006524 | Grad Max: 0.034178 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000287 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001456 | Grad Max: 0.003885 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000416 | Grad Max: 0.001111 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001776 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009631 | Grad Max: 0.009631 [GRADIENT NORM TOTAL] 2.3324 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.642 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50243807 0.49756187] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 670/1378 | B: 533/1323 | C: 320/1728 [LOSS Ex1] A: 0.65019 | B: 0.65054 | C: 0.64553 [LOGITS Ex2 A] Mean Abs: 1.867 | Max: 7.368 [LOSS Ex2] A: 0.16788 | B: 0.36542 | C: 0.28528 ** [JOINT LOSS] ** : 0.921614 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.058330 -> Layer: shared_layers.0.bias | Grad Mean: 0.172327 | Grad Max: 0.809718 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.007050 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002179 | Grad Max: 0.002179 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.001099 | Grad Max: 0.096728 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019363 | Grad Max: 0.531295 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006588 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009865 | Grad Max: 0.050871 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001998 | Grad Max: 0.005191 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000521 | Grad Max: 0.001587 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000483 | Grad Max: 0.001482 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009483 | Grad Max: 0.009483 [GRADIENT NORM TOTAL] 3.1990 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.533 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037916 0.4962085] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 648/1400 | B: 571/1477 | C: 311/1737 [LOSS Ex1] A: 0.65709 | B: 0.64966 | C: 0.64599 [LOGITS Ex2 A] Mean Abs: 1.850 | Max: 6.228 [LOSS Ex2] A: 0.14682 | B: 0.38881 | C: 0.26720 ** [JOINT LOSS] ** : 0.918527 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001889 | Grad Max: 0.052539 -> Layer: shared_layers.0.bias | Grad Mean: 0.026977 | Grad Max: 0.146009 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.005680 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004096 | Grad Max: 0.004096 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000339 | Grad Max: 0.061791 -> Layer: exit2_layers.0.bias | Grad Mean: 0.005455 | Grad Max: 0.323886 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002344 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001524 | Grad Max: 0.014093 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000150 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.000263 | Grad Max: 0.002147 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000595 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001217 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001130 | Grad Max: 0.001130 [GRADIENT NORM TOTAL] 0.9134 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.464 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53784883 0.46215117] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.051 [MASKS] A(Pass/Fail): 641/1407 | B: 573/1475 | C: 336/1712 [LOSS Ex1] A: 0.65687 | B: 0.65041 | C: 0.64451 [LOGITS Ex2 A] Mean Abs: 1.861 | Max: 6.131 [LOSS Ex2] A: 0.16965 | B: 0.38139 | C: 0.28696 ** [JOINT LOSS] ** : 0.929929 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.103971 -> Layer: shared_layers.0.bias | Grad Mean: 0.262384 | Grad Max: 1.264064 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.007002 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010511 | Grad Max: 0.010511 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.259986 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030811 | Grad Max: 1.461378 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.010144 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013801 | Grad Max: 0.085307 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000360 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002946 | Grad Max: 0.006465 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000817 | Grad Max: 0.002019 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.001984 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016742 | Grad Max: 0.016742 [GRADIENT NORM TOTAL] 
5.5673 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.600 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6857831 0.31421688] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.054 [MASKS] A(Pass/Fail): 697/1351 | B: 580/1468 | C: 324/1724 [LOSS Ex1] A: 0.65206 | B: 0.64653 | C: 0.64329 [LOGITS Ex2 A] Mean Abs: 1.899 | Max: 6.401 [LOSS Ex2] A: 0.15110 | B: 0.36279 | C: 0.28708 ** [JOINT LOSS] ** : 0.914280 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.051933 -> Layer: shared_layers.0.bias | Grad Mean: 0.162536 | Grad Max: 0.623986 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.006682 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005554 | Grad Max: 0.005554 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001097 | Grad Max: 0.150931 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019725 | Grad Max: 0.850115 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.006678 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008262 | Grad Max: 0.051616 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000277 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001790 | Grad Max: 0.004879 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000511 | Grad Max: 0.001196 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000566 | Grad Max: 0.001719 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011598 | Grad Max: 0.011598 [GRADIENT NORM TOTAL] 3.4376 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.673 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006405 0.4993595] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 687/1361 | B: 533/1323 | C: 314/1734 [LOSS Ex1] A: 0.65751 
| B: 0.65028 | C: 0.64631 [LOGITS Ex2 A] Mean Abs: 1.867 | Max: 6.047 [LOSS Ex2] A: 0.14992 | B: 0.36528 | C: 0.29211 ** [JOINT LOSS] ** : 0.920470 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008375 | Grad Max: 0.266832 -> Layer: shared_layers.0.bias | Grad Mean: 0.334491 | Grad Max: 1.378706 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005345 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001220 | Grad Max: 0.001220 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002320 | Grad Max: 0.235388 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042505 | Grad Max: 1.324685 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.011834 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021186 | Grad Max: 0.102795 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000512 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004696 | Grad Max: 0.009471 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001317 | Grad Max: 0.003041 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001625 | Grad Max: 0.003020 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029206 | Grad Max: 0.029206 [GRADIENT NORM TOTAL] 6.4413 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.453 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6503997 0.3496003] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.054 [MASKS] A(Pass/Fail): 671/1377 | B: 571/1477 | C: 333/1715 [LOSS Ex1] A: 0.65426 | B: 0.64941 | C: 0.64339 [LOGITS Ex2 A] Mean Abs: 1.872 | Max: 6.735 [LOSS Ex2] A: 0.16564 | B: 0.39060 | C: 0.27553 ** [JOINT LOSS] ** : 0.926278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007627 | Grad Max: 0.233659 -> Layer: shared_layers.0.bias | Grad Mean: 0.362044 | Grad Max: 1.526210 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 
0.006181 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001175 | Grad Max: 0.001175 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002438 | Grad Max: 0.288463 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045063 | Grad Max: 1.491289 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000397 | Grad Max: 0.012462 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021695 | Grad Max: 0.109409 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000603 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004776 | Grad Max: 0.009780 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000272 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001334 | Grad Max: 0.003046 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001525 | Grad Max: 0.002748 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029012 | Grad Max: 0.029012 [GRADIENT NORM TOTAL] 7.1553 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.531 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59464306 0.4053569 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.054 [MASKS] A(Pass/Fail): 558/1058 | B: 573/1475 | C: 313/1735 [LOSS Ex1] A: 0.65277 | B: 0.65017 | C: 0.64415 [LOGITS Ex2 A] Mean Abs: 1.941 | Max: 6.261 [LOSS Ex2] A: 0.16122 | B: 0.37455 | C: 0.27608 ** [JOINT LOSS] ** : 0.919643 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.062991 -> Layer: shared_layers.0.bias | Grad Mean: 0.084707 | Grad Max: 0.471138 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006739 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007973 | Grad Max: 0.007973 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000708 | Grad Max: 0.168721 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012191 | Grad Max: 0.945031 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.004050 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003209 | Grad Max: 0.027569 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000647 | Grad Max: 0.003064 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000168 | Grad Max: 0.000874 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000277 | Grad Max: 0.000875 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001732 | Grad Max: 0.001732 [GRADIENT NORM TOTAL] 2.5010 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.675 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50698155 0.49301842] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 684/1364 | B: 581/1467 | C: 321/1727 [LOSS Ex1] A: 0.65318 | B: 0.64628 | C: 0.64362 [LOGITS Ex2 A] Mean Abs: 1.947 | Max: 6.324 [LOSS Ex2] A: 0.15795 | B: 0.35944 | C: 0.32313 ** [JOINT LOSS] ** : 0.927866 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005019 | Grad Max: 0.136536 -> Layer: shared_layers.0.bias | Grad Mean: 0.366330 | Grad Max: 1.706127 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006122 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001594 | Grad Max: 0.001594 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002388 | Grad Max: 0.270911 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044035 | Grad Max: 1.509097 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000411 | Grad Max: 0.014969 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022675 | Grad Max: 0.122187 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000620 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004940 | Grad Max: 0.010771 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000288 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001368 | Grad Max: 0.003448 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001465 | Grad Max: 0.002796 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.028841 | Grad Max: 0.028841 [GRADIENT NORM TOTAL] 7.3839 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.629 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105151 0.48948497] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 670/1378 | B: 533/1323 | C: 213/1163 [LOSS Ex1] A: 0.65080 | B: 0.65006 | C: 0.64507 [LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.253 [LOSS Ex2] A: 0.15934 | B: 0.35523 | C: 0.28940 ** [JOINT LOSS] ** : 0.916632 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004295 | Grad Max: 0.124322 -> Layer: shared_layers.0.bias | Grad Mean: 0.156461 | Grad Max: 0.629805 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.007087 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010112 | Grad Max: 0.010112 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001193 | Grad Max: 0.173119 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021104 | Grad Max: 0.955830 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.006280 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010819 | Grad Max: 0.049673 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000313 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002417 | Grad Max: 0.005562 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000683 | Grad Max: 0.001751 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000855 | Grad Max: 0.001999 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015580 | Grad Max: 0.015580 [GRADIENT NORM TOTAL] 3.2876 [EPOCH SUMMARY] Train Loss: 0.9217 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.9083 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 95/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.649 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025354 0.4974646] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 671/1377 | B: 571/1477 | C: 331/1717 [LOSS Ex1] A: 0.64979 | B: 0.64919 | C: 0.64341 [LOGITS Ex2 A] Mean Abs: 1.866 | Max: 8.249 [LOSS Ex2] A: 0.16750 | B: 0.41410 | C: 0.26060 ** [JOINT LOSS] ** : 0.928197 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005639 | Grad Max: 0.160169 -> Layer: shared_layers.0.bias | Grad Mean: 0.527535 | Grad Max: 2.250470 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.007001 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004425 | Grad Max: 0.004425 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.546493 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059821 | Grad Max: 3.085160 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.018525 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028232 | Grad Max: 0.151461 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000792 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005967 | Grad Max: 0.012679 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000336 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001649 | Grad Max: 0.003923 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001774 | Grad Max: 0.003227 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035210 | Grad Max: 0.035210 [GRADIENT NORM TOTAL] 10.6143 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.538 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50380105 0.49619892] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.053 [MASKS] A(Pass/Fail): 648/1400 | B: 573/1475 | C: 298/1750 [LOSS Ex1] A: 0.65674 | B: 0.64995 | C: 0.64724 [LOGITS Ex2 A] Mean Abs: 1.833 | Max: 5.960 [LOSS Ex2] A: 0.15232 | B: 0.41675 | C: 0.29010 ** [JOINT LOSS] ** : 0.937705 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006994 | Grad Max: 0.205721 -> Layer: shared_layers.0.bias | Grad Mean: 0.684926 | Grad Max: 2.771009 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.005480 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003252 | Grad Max: 0.003252 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004204 | Grad Max: 0.578660 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079479 | Grad Max: 3.267402 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000707 | Grad Max: 0.022840 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039758 | Grad Max: 0.199027 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000986 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008554 | Grad Max: 0.017102 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000452 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002381 | Grad Max: 0.005461 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002683 | Grad Max: 0.005058 -> Layer: exit2_layers.12.bias | Grad Mean: 0.051953 | Grad Max: 0.051953 [GRADIENT NORM TOTAL] 13.4287 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.095 | Max: 0.469 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5379829 0.46201715] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.542 | Std: 0.052 [MASKS] A(Pass/Fail): 642/1406 | B: 581/1467 | C: 297/1751 [LOSS Ex1] A: 0.65654 | B: 0.64607 | C: 0.64563 [LOGITS Ex2 A] Mean Abs: 1.820 | Max: 6.646 [LOSS Ex2] A: 0.16374 | B: 0.37791 | C: 0.27636 ** [JOINT LOSS] ** : 0.922084 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004241 | Grad Max: 0.132382 
-> Layer: shared_layers.0.bias | Grad Mean: 0.462281 | Grad Max: 1.871760 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006087 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004991 | Grad Max: 0.004991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002784 | Grad Max: 0.413776 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052175 | Grad Max: 2.327711 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.015399 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026451 | Grad Max: 0.133797 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000670 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005692 | Grad Max: 0.012215 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000298 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001564 | Grad Max: 0.003514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001681 | Grad Max: 0.003240 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032810 | Grad Max: 0.032810 [GRADIENT NORM TOTAL] 9.0525 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.606 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6875627 0.31243733] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.054 [MASKS] A(Pass/Fail): 698/1350 | B: 534/1322 | C: 352/1696 [LOSS Ex1] A: 0.65173 | B: 0.64987 | C: 0.64473 [LOGITS Ex2 A] Mean Abs: 1.936 | Max: 6.048 [LOSS Ex2] A: 0.15329 | B: 0.35266 | C: 0.28110 ** [JOINT LOSS] ** : 0.911122 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005415 | Grad Max: 0.184112 -> Layer: shared_layers.0.bias | Grad Mean: 0.132979 | Grad Max: 0.494975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006469 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004833 | Grad Max: 0.004833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001095 | Grad Max: 0.163435 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018904 | Grad Max: 0.930603 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.005367 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008530 | Grad Max: 0.037219 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001991 | Grad Max: 0.004718 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000143 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000564 | Grad Max: 0.001352 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000715 | Grad Max: 0.001856 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012701 | Grad Max: 0.012701 [GRADIENT NORM TOTAL] 3.0521 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.678 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005821 0.4994179] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 688/1360 | B: 572/1476 | C: 376/1672 [LOSS Ex1] A: 0.65722 | B: 0.64900 | C: 0.63880 [LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.042 [LOSS Ex2] A: 0.15339 | B: 0.38495 | C: 0.25598 ** [JOINT LOSS] ** : 0.913115 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.089909 -> Layer: shared_layers.0.bias | Grad Mean: 0.250428 | Grad Max: 1.150944 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005882 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003075 | Grad Max: 0.003075 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001575 | Grad Max: 0.200394 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029403 | Grad Max: 1.117517 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.009367 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014602 | Grad Max: 0.074623 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000383 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003027 | Grad Max: 0.006740 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000836 | Grad Max: 0.001982 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000851 | Grad Max: 0.002230 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017411 | Grad Max: 0.017411 [GRADIENT NORM TOTAL] 5.1473 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.457 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.65160936 0.34839067] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.054 [MASKS] A(Pass/Fail): 671/1377 | B: 574/1474 | C: 296/1752 [LOSS Ex1] A: 0.65395 | B: 0.64978 | C: 0.64636 [LOGITS Ex2 A] Mean Abs: 1.931 | Max: 6.034 [LOSS Ex2] A: 0.16987 | B: 0.38479 | C: 0.28297 ** [JOINT LOSS] ** : 0.929240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003009 | Grad Max: 0.118189 -> Layer: shared_layers.0.bias | Grad Mean: 0.133539 | Grad Max: 0.617649 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.006054 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001187 | Grad Max: 0.001187 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001078 | Grad Max: 0.262906 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018350 | Grad Max: 1.484137 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.006227 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006360 | Grad Max: 0.049217 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000261 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001179 | Grad Max: 0.003702 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000333 | Grad Max: 0.001040 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001092 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007700 | Grad Max: 0.007700 [GRADIENT NORM TOTAL] 3.3885 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.102 | Max: 0.536 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5953082 0.40469185] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.054 [MASKS] A(Pass/Fail): 559/1057 | B: 583/1465 | C: 322/1726 [LOSS Ex1] A: 0.65245 | B: 0.64589 | C: 0.64397 [LOGITS Ex2 A] Mean Abs: 1.928 | Max: 5.968 [LOSS Ex2] A: 0.15797 | B: 0.37491 | C: 0.28946 ** [JOINT LOSS] ** : 0.921553 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007445 | Grad Max: 0.228778 -> Layer: shared_layers.0.bias | Grad Mean: 0.329196 | Grad Max: 1.321798 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.006255 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003977 | Grad Max: 0.003977 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002315 | Grad Max: 0.405232 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041794 | Grad Max: 2.265126 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000365 | Grad Max: 0.011840 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019769 | Grad Max: 0.096096 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000540 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004441 | Grad Max: 0.009828 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.002910 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001544 | Grad Max: 0.002883 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027837 | Grad Max: 0.027837 [GRADIENT NORM TOTAL] 6.8702 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.680 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070216 0.49297833] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 685/1363 | B: 535/1321 | C: 328/1720 [LOSS Ex1] A: 0.65288 | B: 0.64969 | C: 0.64532 [LOGITS Ex2 A] Mean Abs: 1.937 | Max: 6.563 [LOSS Ex2] A: 0.14920 | B: 0.36861 | C: 0.29089 ** [JOINT LOSS] ** : 0.918863 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006087 | Grad Max: 0.208961 -> Layer: shared_layers.0.bias | Grad Mean: 0.349724 | Grad Max: 1.349849 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006362 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003453 | Grad Max: 0.003453 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002334 | Grad Max: 0.288815 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042769 | Grad Max: 1.582353 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000396 | Grad Max: 0.012947 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021901 | Grad Max: 0.100235 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000560 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004789 | Grad Max: 0.009834 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001324 | Grad Max: 0.003021 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001461 | Grad Max: 0.002551 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027969 | Grad Max: 0.027969 [GRADIENT NORM TOTAL] 6.8330 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.634 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51052964 0.48947036] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 670/1378 | B: 573/1475 | C: 334/1714 [LOSS Ex1] A: 0.65049 | B: 0.64883 | C: 0.64394 [LOGITS Ex2 A] Mean Abs: 1.945 | Max: 6.028 [LOSS Ex2] A: 0.14719 | B: 0.38461 | C: 0.31787 ** [JOINT LOSS] ** : 0.930979 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002720 | Grad Max: 0.065669 -> Layer: shared_layers.0.bias | Grad Mean: 0.129176 | Grad Max: 0.698761 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002327 | Grad Max: 0.006949 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009562 | Grad Max: 0.009562 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000889 | Grad Max: 
0.132039 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015749 | Grad Max: 0.745684 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004849 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005998 | Grad Max: 0.035837 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000253 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001202 | Grad Max: 0.003638 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000311 | Grad Max: 0.001027 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000322 | Grad Max: 0.000920 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005527 | Grad Max: 0.005527 [GRADIENT NORM TOTAL] 2.7244 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.654 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025835 0.49741647] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 671/1377 | B: 575/1473 | C: 320/1728 [LOSS Ex1] A: 0.64949 | B: 0.64961 | C: 0.64292 [LOGITS Ex2 A] Mean Abs: 1.918 | Max: 6.990 [LOSS Ex2] A: 0.17340 | B: 0.38335 | C: 0.29780 ** [JOINT LOSS] ** : 0.932187 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003958 | Grad Max: 0.183119 -> Layer: shared_layers.0.bias | Grad Mean: 0.162812 | Grad Max: 0.674775 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.006429 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001032 | Grad Max: 0.001032 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001197 | Grad Max: 0.137288 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020990 | Grad Max: 0.766810 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.005503 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009319 | Grad Max: 0.041284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000324 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002115 | Grad Max: 0.005359 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000153 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000571 | Grad Max: 0.001635 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000510 | Grad Max: 0.001315 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010000 | Grad Max: 0.010000 [GRADIENT NORM TOTAL] 3.2415 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.543 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037752 0.49622482] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.543 | Std: 0.053 [MASKS] A(Pass/Fail): 649/1399 | B: 583/1465 | C: 356/1692 [LOSS Ex1] A: 0.65648 | B: 0.64571 | C: 0.63951 [LOGITS Ex2 A] Mean Abs: 1.867 | Max: 6.660 [LOSS Ex2] A: 0.15149 | B: 0.36338 | C: 0.27657 ** [JOINT LOSS] ** : 0.911049 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002437 | Grad Max: 0.077779 -> Layer: shared_layers.0.bias | Grad Mean: 0.200605 | Grad Max: 1.069011 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006346 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005502 | Grad Max: 0.005502 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001363 | Grad Max: 0.230979 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025235 | Grad Max: 1.299665 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.008697 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013531 | Grad Max: 0.075741 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000402 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002849 | Grad Max: 0.006474 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000178 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000780 | Grad Max: 0.001873 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000839 | Grad Max: 0.002153 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016541 | Grad Max: 0.016541 [GRADIENT NORM TOTAL] 4.2594 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.096 | Max: 0.473 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.538192 0.46180797] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 642/1406 | B: 535/1321 | C: 334/1714 [LOSS Ex1] A: 0.65628 | B: 0.64952 | C: 0.64381 [LOGITS Ex2 A] Mean Abs: 1.848 | Max: 6.091 [LOSS Ex2] A: 0.16384 | B: 0.35375 | C: 0.27194 ** [JOINT LOSS] ** : 0.913046 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.028257 -> Layer: shared_layers.0.bias | Grad Mean: 0.107623 | Grad Max: 0.581931 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005953 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002961 | Grad Max: 0.002961 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000809 | Grad Max: 0.156247 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014485 | Grad Max: 0.877415 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.004809 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006144 | Grad Max: 0.036561 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000231 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001248 | Grad Max: 0.004075 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000106 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000334 | Grad Max: 0.000967 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000405 | Grad Max: 0.001311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007062 | Grad Max: 0.007062 [GRADIENT NORM TOTAL] 2.4131 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.611 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6891596 0.31084046] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.055 [MASKS] A(Pass/Fail): 698/1350 | B: 574/1474 | C: 324/1724 [LOSS Ex1] A: 0.65143 | B: 0.64864 | C: 0.64354 [LOGITS Ex2 A] Mean Abs: 
1.922 | Max: 5.829 [LOSS Ex2] A: 0.16209 | B: 0.37690 | C: 0.29753 ** [JOINT LOSS] ** : 0.926711 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007499 | Grad Max: 0.249016 -> Layer: shared_layers.0.bias | Grad Mean: 0.422647 | Grad Max: 1.521842 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006758 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005212 | Grad Max: 0.005212 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002786 | Grad Max: 0.265831 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051409 | Grad Max: 1.427521 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000462 | Grad Max: 0.014240 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025506 | Grad Max: 0.121124 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000636 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005595 | Grad Max: 0.011312 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000314 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001560 | Grad Max: 0.003646 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001684 | Grad Max: 0.003434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032897 | Grad Max: 0.032897 [GRADIENT NORM TOTAL] 8.0389 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.684 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006233 0.4993767] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 688/1360 | B: 575/1473 | C: 224/1152 [LOSS Ex1] A: 0.65695 | B: 0.64942 | C: 0.64461 [LOGITS Ex2 A] Mean Abs: 1.915 | Max: 5.933 [LOSS Ex2] A: 0.15008 | B: 0.37895 | C: 0.30964 ** [JOINT LOSS] ** : 0.929885 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003621 | Grad Max: 0.142012 -> Layer: shared_layers.0.bias | Grad Mean: 0.383318 | Grad Max: 1.897784 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.005862 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.004593 | Grad Max: 0.004593 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002477 | Grad Max: 0.325905 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046312 | Grad Max: 1.836332 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000398 | Grad Max: 0.012600 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022290 | Grad Max: 0.113344 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000575 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004681 | Grad Max: 0.009953 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000279 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001274 | Grad Max: 0.003387 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001282 | Grad Max: 0.002633 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025329 | Grad Max: 0.025329 [GRADIENT NORM TOTAL] 8.2012 [EPOCH SUMMARY] Train Loss: 0.9233 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.9004 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.9066 -> New: 0.9004) ############################## EPOCH 96/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.462 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.65282863 0.3471714 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 672/1376 | B: 583/1465 | C: 343/1705 [LOSS Ex1] A: 0.65367 | B: 0.64552 | C: 0.64389 [LOGITS Ex2 A] Mean Abs: 1.881 | Max: 5.733 [LOSS Ex2] A: 0.15994 | B: 0.36298 | C: 0.29075 ** [JOINT LOSS] ** : 0.918913 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002945 | Grad Max: 0.067443 -> Layer: shared_layers.0.bias | Grad Mean: 0.074574 | Grad Max: 0.329009 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006187 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000313 | Grad Max: 0.000313 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000600 | Grad Max: 
0.293885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010055 | Grad Max: 1.646561 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.003160 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002698 | Grad Max: 0.014748 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000149 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000626 | Grad Max: 0.002306 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000179 | Grad Max: 0.000676 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000307 | Grad Max: 0.001134 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004209 | Grad Max: 0.004209 [GRADIENT NORM TOTAL] 2.5083 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.540 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.59604824 0.40395176] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 562/1054 | B: 535/1321 | C: 342/1706 [LOSS Ex1] A: 0.65216 | B: 0.64934 | C: 0.63989 [LOGITS Ex2 A] Mean Abs: 1.886 | Max: 6.294 [LOSS Ex2] A: 0.15949 | B: 0.36875 | C: 0.27000 ** [JOINT LOSS] ** : 0.913212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006093 | Grad Max: 0.196240 -> Layer: shared_layers.0.bias | Grad Mean: 0.242952 | Grad Max: 0.941032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.006643 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006643 | Grad Max: 0.006643 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001600 | Grad Max: 0.238397 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029146 | Grad Max: 1.251453 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000273 | Grad Max: 0.009354 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015024 | Grad Max: 0.079867 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000415 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003342 | Grad Max: 0.007372 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.002374 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.002520 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020438 | Grad Max: 0.020438 [GRADIENT NORM TOTAL] 4.6204 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.686 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070279 0.49297202] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 685/1363 | B: 575/1473 | C: 356/1692 [LOSS Ex1] A: 0.65260 | B: 0.64846 | C: 0.63924 [LOGITS Ex2 A] Mean Abs: 1.906 | Max: 7.816 [LOSS Ex2] A: 0.15956 | B: 0.37810 | C: 0.26926 ** [JOINT LOSS] ** : 0.915742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002733 | Grad Max: 0.136429 -> Layer: shared_layers.0.bias | Grad Mean: 0.057560 | Grad Max: 0.367193 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003286 | Grad Max: 0.003286 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000495 | Grad Max: 0.119085 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007818 | Grad Max: 0.670085 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.004862 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001632 | Grad Max: 0.027185 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000120 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000262 | Grad Max: 0.001494 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000050 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000073 | Grad Max: 0.000366 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000338 | Grad Max: 0.000905 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000600 | Grad Max: 0.000600 [GRADIENT NORM TOTAL] 1.5736 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.640 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105924 0.4894076] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 670/1378 | B: 575/1473 | C: 329/1719 [LOSS Ex1] A: 0.65018 | B: 0.64923 | C: 0.64482 [LOGITS Ex2 A] Mean Abs: 1.909 | Max: 5.736 [LOSS Ex2] A: 0.16057 | B: 0.36709 | C: 0.27339 ** [JOINT LOSS] ** : 0.915089 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003622 | Grad Max: 0.084126 -> Layer: shared_layers.0.bias | Grad Mean: 0.174515 | Grad Max: 0.733827 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006538 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001468 | Grad Max: 0.001468 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.181908 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022746 | Grad Max: 0.934671 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.007335 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010350 | Grad Max: 0.053956 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000319 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002292 | Grad Max: 0.005506 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000642 | Grad Max: 0.001551 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000701 | Grad Max: 0.002139 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014272 | Grad Max: 0.014272 [GRADIENT NORM TOTAL] 3.7610 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.661 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50259775 0.49740222] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 671/1377 | B: 583/1465 | C: 378/1670 [LOSS Ex1] A: 0.64918 | B: 0.64532 | C: 0.63543 [LOGITS Ex2 A] Mean Abs: 
1.882 | Max: 6.664 [LOSS Ex2] A: 0.16793 | B: 0.36451 | C: 0.26455 ** [JOINT LOSS] ** : 0.908971 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003715 | Grad Max: 0.169901 -> Layer: shared_layers.0.bias | Grad Mean: 0.050235 | Grad Max: 0.239643 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002415 | Grad Max: 0.006811 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000352 | Grad Max: 0.000352 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000606 | Grad Max: 0.129338 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008918 | Grad Max: 0.679771 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003103 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002597 | Grad Max: 0.016190 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000204 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000679 | Grad Max: 0.002507 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000218 | Grad Max: 0.000844 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001400 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005810 | Grad Max: 0.005810 [GRADIENT NORM TOTAL] 1.6805 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.547 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037513 0.49624872] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.054 [MASKS] A(Pass/Fail): 650/1398 | B: 536/1320 | C: 330/1718 [LOSS Ex1] A: 0.65620 | B: 0.64913 | C: 0.64390 [LOGITS Ex2 A] Mean Abs: 1.841 | Max: 5.661 [LOSS Ex2] A: 0.15141 | B: 0.36407 | C: 0.28654 ** [JOINT LOSS] ** : 0.917084 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003607 | Grad Max: 0.103240 -> Layer: shared_layers.0.bias | Grad Mean: 0.289124 | Grad Max: 1.226943 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005566 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.000742 | Grad Max: 0.000742 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001844 | Grad Max: 0.247787 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034318 | Grad Max: 1.375772 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.010585 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017673 | Grad Max: 0.085735 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000447 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003812 | Grad Max: 0.007730 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001046 | Grad Max: 0.002671 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001096 | Grad Max: 0.002374 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021606 | Grad Max: 0.021606 [GRADIENT NORM TOTAL] 5.5797 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.478 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5384094 0.46159056] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.543 | Std: 0.052 [MASKS] A(Pass/Fail): 643/1405 | B: 576/1472 | C: 361/1687 [LOSS Ex1] A: 0.65601 | B: 0.64825 | C: 0.63861 [LOGITS Ex2 A] Mean Abs: 1.829 | Max: 6.945 [LOSS Ex2] A: 0.16952 | B: 0.38656 | C: 0.27433 ** [JOINT LOSS] ** : 0.924426 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004697 | Grad Max: 0.157645 -> Layer: shared_layers.0.bias | Grad Mean: 0.291073 | Grad Max: 1.213680 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.006031 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004450 | Grad Max: 0.004450 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.402839 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035926 | Grad Max: 2.259087 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.010536 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016803 | Grad Max: 0.085138 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | 
Grad Max: 0.000535 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003721 | Grad Max: 0.008742 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001038 | Grad Max: 0.002488 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001122 | Grad Max: 0.002164 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022024 | Grad Max: 0.022024 [GRADIENT NORM TOTAL] 6.2144 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.616 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.69079465 0.30920535] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.055 [MASKS] A(Pass/Fail): 698/1350 | B: 575/1473 | C: 302/1746 [LOSS Ex1] A: 0.65113 | B: 0.64902 | C: 0.64908 [LOGITS Ex2 A] Mean Abs: 1.919 | Max: 6.143 [LOSS Ex2] A: 0.15079 | B: 0.37253 | C: 0.28534 ** [JOINT LOSS] ** : 0.919294 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004331 | Grad Max: 0.119956 -> Layer: shared_layers.0.bias | Grad Mean: 0.252348 | Grad Max: 1.267571 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006698 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008157 | Grad Max: 0.008157 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.211721 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031844 | Grad Max: 1.182483 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000267 | Grad Max: 0.010416 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014895 | Grad Max: 0.083638 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000392 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003222 | Grad Max: 0.007221 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000886 | Grad Max: 0.002200 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000960 | Grad Max: 0.002462 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018776 | Grad Max: 
0.018776 [GRADIENT NORM TOTAL] 5.3808 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.690 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50059927 0.49940073] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.054 [MASKS] A(Pass/Fail): 688/1360 | B: 584/1464 | C: 318/1730 [LOSS Ex1] A: 0.65669 | B: 0.64509 | C: 0.64578 [LOGITS Ex2 A] Mean Abs: 1.922 | Max: 5.716 [LOSS Ex2] A: 0.14807 | B: 0.36259 | C: 0.30354 ** [JOINT LOSS] ** : 0.920586 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003847 | Grad Max: 0.132082 -> Layer: shared_layers.0.bias | Grad Mean: 0.339030 | Grad Max: 1.715714 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005547 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002098 | Grad Max: 0.002098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.258118 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040583 | Grad Max: 1.444995 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.012949 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019636 | Grad Max: 0.105320 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000549 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004034 | Grad Max: 0.008768 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001096 | Grad Max: 0.002698 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001034 | Grad Max: 0.002699 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022039 | Grad Max: 0.022039 [GRADIENT NORM TOTAL] 7.1605 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.466 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6540166 0.3459834] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 673/1375 | B: 536/1320 | C: 
303/1745 [LOSS Ex1] A: 0.65338 | B: 0.64894 | C: 0.64840 [LOGITS Ex2 A] Mean Abs: 1.898 | Max: 5.937 [LOSS Ex2] A: 0.16241 | B: 0.35768 | C: 0.30479 ** [JOINT LOSS] ** : 0.925200 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004528 | Grad Max: 0.180752 -> Layer: shared_layers.0.bias | Grad Mean: 0.089789 | Grad Max: 0.389979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.006179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003382 | Grad Max: 0.003382 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000819 | Grad Max: 0.257296 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012554 | Grad Max: 1.440299 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003395 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002423 | Grad Max: 0.019616 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000555 | Grad Max: 0.002466 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000152 | Grad Max: 0.000702 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000218 | Grad Max: 0.000931 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002480 | Grad Max: 0.002480 [GRADIENT NORM TOTAL] 2.6876 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.545 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.596688 0.4033121] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 564/1052 | B: 576/1472 | C: 304/1744 [LOSS Ex1] A: 0.65186 | B: 0.64805 | C: 0.64409 [LOGITS Ex2 A] Mean Abs: 1.926 | Max: 7.010 [LOSS Ex2] A: 0.15149 | B: 0.38339 | C: 0.28864 ** [JOINT LOSS] ** : 0.922508 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003531 | Grad Max: 0.102368 -> Layer: shared_layers.0.bias | Grad Mean: 0.281718 | Grad Max: 1.088805 -> Layer: exit1_layers.0.weight | Grad 
Mean: 0.002125 | Grad Max: 0.006336 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004806 | Grad Max: 0.004806 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001796 | Grad Max: 0.236063 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033131 | Grad Max: 1.324898 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.010876 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016093 | Grad Max: 0.084801 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000457 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003488 | Grad Max: 0.007454 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000196 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000989 | Grad Max: 0.002320 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001134 | Grad Max: 0.002166 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022470 | Grad Max: 0.022470 [GRADIENT NORM TOTAL] 5.7384 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.692 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50707453 0.49292544] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 685/1363 | B: 576/1472 | C: 304/1744 [LOSS Ex1] A: 0.65230 | B: 0.64882 | C: 0.64535 [LOGITS Ex2 A] Mean Abs: 1.923 | Max: 6.011 [LOSS Ex2] A: 0.14773 | B: 0.38069 | C: 0.26708 ** [JOINT LOSS] ** : 0.913988 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001979 | Grad Max: 0.055798 -> Layer: shared_layers.0.bias | Grad Mean: 0.046717 | Grad Max: 0.217461 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.005891 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001940 | Grad Max: 0.001940 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000502 | Grad Max: 0.141548 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008125 | Grad Max: 0.797990 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003054 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001950 
| Grad Max: 0.017916 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000312 | Grad Max: 0.001824 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000409 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000329 | Grad Max: 0.000944 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001386 | Grad Max: 0.001386 [GRADIENT NORM TOTAL] 1.5581 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.646 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105801 0.4894199] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 671/1377 | B: 584/1464 | C: 310/1738 [LOSS Ex1] A: 0.64986 | B: 0.64488 | C: 0.64556 [LOGITS Ex2 A] Mean Abs: 1.955 | Max: 6.326 [LOSS Ex2] A: 0.15885 | B: 0.35923 | C: 0.27840 ** [JOINT LOSS] ** : 0.912259 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005340 | Grad Max: 0.145862 -> Layer: shared_layers.0.bias | Grad Mean: 0.305627 | Grad Max: 1.438969 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006788 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004484 | Grad Max: 0.004484 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.238422 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037281 | Grad Max: 1.265579 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.010728 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017488 | Grad Max: 0.086785 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000452 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003831 | Grad Max: 0.007888 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001068 | Grad Max: 0.002623 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001157 | Grad Max: 
0.002566 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022778 | Grad Max: 0.022778 [GRADIENT NORM TOTAL] 6.1729 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.666 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50264376 0.49735624] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 672/1376 | B: 538/1318 | C: 243/1133 [LOSS Ex1] A: 0.64885 | B: 0.64872 | C: 0.63785 [LOGITS Ex2 A] Mean Abs: 1.913 | Max: 7.981 [LOSS Ex2] A: 0.16957 | B: 0.36069 | C: 0.30654 ** [JOINT LOSS] ** : 0.924074 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004444 | Grad Max: 0.174911 -> Layer: shared_layers.0.bias | Grad Mean: 0.172737 | Grad Max: 0.872728 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002356 | Grad Max: 0.006703 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004780 | Grad Max: 0.004780 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001243 | Grad Max: 0.149341 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021604 | Grad Max: 0.840314 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006348 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009057 | Grad Max: 0.049678 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000366 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002066 | Grad Max: 0.005661 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000580 | Grad Max: 0.001472 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000697 | Grad Max: 0.001667 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013032 | Grad Max: 0.013032 [GRADIENT NORM TOTAL] 3.6014 [EPOCH SUMMARY] Train Loss: 0.9180 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8996 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.9004 -> New: 0.8996) ############################## EPOCH 97/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.552 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037284 0.49627158] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.054 [MASKS] A(Pass/Fail): 651/1397 | B: 576/1472 | C: 312/1736 [LOSS Ex1] A: 0.65593 | B: 0.64783 | C: 0.64488 [LOGITS Ex2 A] Mean Abs: 1.843 | Max: 6.628 [LOSS Ex2] A: 0.14347 | B: 0.38445 | C: 0.26389 ** [JOINT LOSS] ** : 0.913483 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003338 | Grad Max: 0.108143 -> Layer: shared_layers.0.bias | Grad Mean: 0.313096 | Grad Max: 1.486660 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005695 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000098 | Grad Max: 0.000098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.282588 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037907 | Grad Max: 1.564606 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000340 | Grad Max: 0.013057 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019375 | Grad Max: 0.108198 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000524 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004170 | Grad Max: 0.009141 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001146 | Grad Max: 0.002888 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001205 | Grad Max: 0.002852 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023631 | Grad Max: 0.023631 [GRADIENT NORM TOTAL] 6.4766 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.097 | Max: 0.482 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53869236 0.46130767] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.543 | Std: 0.053 [MASKS] A(Pass/Fail): 645/1403 | B: 576/1472 | C: 331/1717 [LOSS Ex1] A: 0.65575 | B: 0.64860 | C: 0.64116 [LOGITS Ex2 A] Mean Abs: 1.813 | Max: 5.848 [LOSS Ex2] A: 0.15663 | B: 0.38761 | C: 0.26605 ** [JOINT LOSS] ** : 0.918600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004905 | Grad Max: 0.134786 -> Layer: shared_layers.0.bias | Grad Mean: 0.259163 | Grad Max: 1.258257 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005718 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002746 | Grad Max: 0.002746 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001747 | Grad Max: 0.233337 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032362 | Grad Max: 1.320776 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.011393 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016903 | Grad Max: 0.083843 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000428 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003664 | Grad Max: 0.007820 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001014 | Grad Max: 0.002725 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001128 | Grad Max: 0.002499 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021548 | Grad Max: 0.021548 [GRADIENT NORM TOTAL] 5.1499 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.621 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6924594 0.30754057] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 699/1349 | B: 590/1458 | C: 375/1673 [LOSS Ex1] A: 0.65083 | B: 0.64467 | C: 0.64370 [LOGITS Ex2 A] Mean Abs: 1.915 | Max: 6.192 [LOSS Ex2] A: 0.14329 | B: 0.35403 | C: 0.29519 ** [JOINT LOSS] ** : 0.910570 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002754 | Grad Max: 0.067755 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.204390 | Grad Max: 0.834073 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006190 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000409 | Grad Max: 0.000409 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.206644 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025266 | Grad Max: 1.161623 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.007871 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010880 | Grad Max: 0.058610 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002379 | Grad Max: 0.005593 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000652 | Grad Max: 0.001609 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000655 | Grad Max: 0.002062 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013389 | Grad Max: 0.013389 [GRADIENT NORM TOTAL] 4.4152 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.695 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006403 0.49935973] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 690/1358 | B: 542/1314 | C: 316/1732 [LOSS Ex1] A: 0.65643 | B: 0.64853 | C: 0.64514 [LOGITS Ex2 A] Mean Abs: 1.905 | Max: 5.746 [LOSS Ex2] A: 0.15088 | B: 0.34604 | C: 0.27996 ** [JOINT LOSS] ** : 0.908993 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002455 | Grad Max: 0.048362 -> Layer: shared_layers.0.bias | Grad Mean: 0.148792 | Grad Max: 0.546292 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.005286 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001105 | Grad Max: 0.001105 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001031 | Grad Max: 0.184028 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018334 | Grad Max: 1.037607 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 
| Grad Max: 0.006907 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008051 | Grad Max: 0.056742 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001673 | Grad Max: 0.004744 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000126 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000463 | Grad Max: 0.001309 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000507 | Grad Max: 0.001710 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009712 | Grad Max: 0.009712 [GRADIENT NORM TOTAL] 3.4489 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.470 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.65524715 0.3447528 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 673/1375 | B: 578/1470 | C: 322/1726 [LOSS Ex1] A: 0.65310 | B: 0.64764 | C: 0.64370 [LOGITS Ex2 A] Mean Abs: 1.903 | Max: 6.059 [LOSS Ex2] A: 0.15961 | B: 0.37894 | C: 0.28266 ** [JOINT LOSS] ** : 0.921878 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003556 | Grad Max: 0.116319 -> Layer: shared_layers.0.bias | Grad Mean: 0.132386 | Grad Max: 0.602117 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005982 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000394 | Grad Max: 0.000394 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000941 | Grad Max: 0.340815 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016561 | Grad Max: 1.915663 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.005806 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006545 | Grad Max: 0.034251 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000245 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001505 | Grad Max: 0.003795 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000123 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000425 | Grad Max: 
0.001139 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001263 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008844 | Grad Max: 0.008844 [GRADIENT NORM TOTAL] 3.4825 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.550 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5974096 0.40259036] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 565/1051 | B: 580/1468 | C: 337/1711 [LOSS Ex1] A: 0.65157 | B: 0.64841 | C: 0.64200 [LOGITS Ex2 A] Mean Abs: 1.937 | Max: 6.831 [LOSS Ex2] A: 0.14212 | B: 0.38007 | C: 0.26246 ** [JOINT LOSS] ** : 0.908882 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001682 | Grad Max: 0.035577 -> Layer: shared_layers.0.bias | Grad Mean: 0.112690 | Grad Max: 0.483143 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006164 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001428 | Grad Max: 0.001428 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000727 | Grad Max: 0.341326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013179 | Grad Max: 1.915355 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004672 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005768 | Grad Max: 0.035713 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000211 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001227 | Grad Max: 0.003428 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000354 | Grad Max: 0.001059 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001501 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008358 | Grad Max: 0.008358 [GRADIENT NORM TOTAL] 3.0891 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.697 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5070841 0.49291593] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 685/1363 | B: 591/1457 | C: 314/1734 [LOSS Ex1] A: 0.65201 | B: 0.64448 | C: 0.64433 [LOGITS Ex2 A] Mean Abs: 1.939 | Max: 8.111 [LOSS Ex2] A: 0.15284 | B: 0.35355 | C: 0.29670 ** [JOINT LOSS] ** : 0.914631 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003240 | Grad Max: 0.086818 -> Layer: shared_layers.0.bias | Grad Mean: 0.226650 | Grad Max: 1.141403 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.005634 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000620 | Grad Max: 0.000620 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001465 | Grad Max: 0.191515 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027064 | Grad Max: 1.087110 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.010703 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012718 | Grad Max: 0.090364 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000358 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002730 | Grad Max: 0.006400 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000209 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000745 | Grad Max: 0.002159 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000772 | Grad Max: 0.001980 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015598 | Grad Max: 0.015598 [GRADIENT NORM TOTAL] 4.7794 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.651 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51068187 0.48931813] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 673/1375 | B: 542/1314 | C: 338/1710 [LOSS Ex1] A: 0.64955 | B: 0.64834 | C: 0.64201 [LOGITS Ex2 A] Mean Abs: 1.910 | Max: 5.529 [LOSS Ex2] A: 0.15557 | B: 0.34976 | C: 0.27447 ** [JOINT LOSS] ** : 0.906569 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.002984 | Grad Max: 0.078484 -> Layer: shared_layers.0.bias | Grad Mean: 0.076133 | Grad Max: 0.326454 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.007005 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007214 | Grad Max: 0.007214 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000618 | Grad Max: 0.131492 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010809 | Grad Max: 0.738301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.003026 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003709 | Grad Max: 0.023413 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000843 | Grad Max: 0.002748 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000249 | Grad Max: 0.000885 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001421 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006213 | Grad Max: 0.006213 [GRADIENT NORM TOTAL] 1.9943 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.672 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026653 0.4973347] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 672/1376 | B: 579/1469 | C: 313/1735 [LOSS Ex1] A: 0.64854 | B: 0.64744 | C: 0.64273 [LOGITS Ex2 A] Mean Abs: 1.880 | Max: 5.936 [LOSS Ex2] A: 0.16252 | B: 0.39427 | C: 0.28240 ** [JOINT LOSS] ** : 0.925967 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003463 | Grad Max: 0.093696 -> Layer: shared_layers.0.bias | Grad Mean: 0.294893 | Grad Max: 1.308844 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006309 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000633 | Grad Max: 0.000633 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001831 | Grad Max: 0.240906 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.033277 | Grad Max: 1.349881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.010826 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016137 | Grad Max: 0.091073 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000440 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003300 | Grad Max: 0.007160 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000176 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000912 | Grad Max: 0.002170 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001007 | Grad Max: 0.001922 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020161 | Grad Max: 0.020161 [GRADIENT NORM TOTAL] 5.8132 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.557 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50367177 0.49632826] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.544 | Std: 0.055 [MASKS] A(Pass/Fail): 652/1396 | B: 580/1468 | C: 319/1729 [LOSS Ex1] A: 0.65566 | B: 0.64822 | C: 0.64255 [LOGITS Ex2 A] Mean Abs: 1.871 | Max: 6.925 [LOSS Ex2] A: 0.14036 | B: 0.38470 | C: 0.31055 ** [JOINT LOSS] ** : 0.927349 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003455 | Grad Max: 0.096556 -> Layer: shared_layers.0.bias | Grad Mean: 0.241048 | Grad Max: 0.919750 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005822 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004793 | Grad Max: 0.004793 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001562 | Grad Max: 0.215744 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029009 | Grad Max: 1.212730 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000271 | Grad Max: 0.009615 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015311 | Grad Max: 0.081188 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000438 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003327 | Grad Max: 0.007461 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | 
Grad Max: 0.000185 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000933 | Grad Max: 0.002145 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001035 | Grad Max: 0.002076 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020617 | Grad Max: 0.020617 [GRADIENT NORM TOTAL] 4.6905 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.098 | Max: 0.486 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53897285 0.46102712] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.053 [MASKS] A(Pass/Fail): 647/1401 | B: 591/1457 | C: 327/1721 [LOSS Ex1] A: 0.65549 | B: 0.64427 | C: 0.64376 [LOGITS Ex2 A] Mean Abs: 1.885 | Max: 6.381 [LOSS Ex2] A: 0.16153 | B: 0.35856 | C: 0.28576 ** [JOINT LOSS] ** : 0.916456 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002338 | Grad Max: 0.108712 -> Layer: shared_layers.0.bias | Grad Mean: 0.255197 | Grad Max: 1.290209 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.006532 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008063 | Grad Max: 0.008063 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001618 | Grad Max: 0.269686 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029784 | Grad Max: 1.502250 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.009678 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013812 | Grad Max: 0.078806 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000383 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002952 | Grad Max: 0.006779 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000818 | Grad Max: 0.001864 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000776 | Grad Max: 0.001918 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016756 | Grad Max: 0.016756 [GRADIENT NORM TOTAL] 5.5258 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.626 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6940681 0.30593193] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.056 [MASKS] A(Pass/Fail): 699/1349 | B: 542/1314 | C: 348/1700 [LOSS Ex1] A: 0.65054 | B: 0.64814 | C: 0.63919 [LOGITS Ex2 A] Mean Abs: 1.946 | Max: 7.389 [LOSS Ex2] A: 0.15253 | B: 0.35521 | C: 0.26642 ** [JOINT LOSS] ** : 0.904011 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002974 | Grad Max: 0.119486 -> Layer: shared_layers.0.bias | Grad Mean: 0.306689 | Grad Max: 1.467408 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006942 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007313 | Grad Max: 0.007313 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001965 | Grad Max: 0.284114 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036690 | Grad Max: 1.586659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.012681 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017982 | Grad Max: 0.104483 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000472 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003842 | Grad Max: 0.007903 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001079 | Grad Max: 0.002334 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001121 | Grad Max: 0.002731 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023216 | Grad Max: 0.023216 [GRADIENT NORM TOTAL] 6.4874 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.701 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006926 0.49930742] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.055 [MASKS] A(Pass/Fail): 690/1358 | B: 579/1469 | C: 336/1712 [LOSS Ex1] A: 0.65617 | B: 0.64725 | C: 0.64046 [LOGITS Ex2 A] Mean Abs: 1.930 | Max: 5.474 [LOSS Ex2] A: 0.14399 | B: 0.37892 | C: 
0.28321 ** [JOINT LOSS] ** : 0.916664 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003037 | Grad Max: 0.107179 -> Layer: shared_layers.0.bias | Grad Mean: 0.063745 | Grad Max: 0.272212 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005624 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000119 | Grad Max: 0.000119 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000620 | Grad Max: 0.112868 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009691 | Grad Max: 0.626685 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003098 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001794 | Grad Max: 0.015246 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000121 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.001788 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000538 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000268 | Grad Max: 0.000946 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002527 | Grad Max: 0.002527 [GRADIENT NORM TOTAL] 1.9461 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.475 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6564448 0.3435552] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 673/1375 | B: 581/1467 | C: 247/1129 [LOSS Ex1] A: 0.65282 | B: 0.64803 | C: 0.63509 [LOGITS Ex2 A] Mean Abs: 1.900 | Max: 6.110 [LOSS Ex2] A: 0.16380 | B: 0.38917 | C: 0.25571 ** [JOINT LOSS] ** : 0.914872 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004107 | Grad Max: 0.114661 -> Layer: shared_layers.0.bias | Grad Mean: 0.181688 | Grad Max: 0.850236 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006217 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000080 | Grad Max: 0.000080 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.001197 | Grad Max: 0.240624 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021565 | Grad Max: 1.372044 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.006928 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010342 | Grad Max: 0.054509 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000373 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002283 | Grad Max: 0.005553 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000633 | Grad Max: 0.001596 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000717 | Grad Max: 0.001600 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013728 | Grad Max: 0.013728 [GRADIENT NORM TOTAL] 3.8256 [EPOCH SUMMARY] Train Loss: 0.9149 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8952 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8996 -> New: 0.8952) ############################## EPOCH 98/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.555 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.598105 0.40189496] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 565/1051 | B: 591/1457 | C: 328/1720 [LOSS Ex1] A: 0.65128 | B: 0.64407 | C: 0.64169 [LOGITS Ex2 A] Mean Abs: 1.961 | Max: 6.226 [LOSS Ex2] A: 0.14329 | B: 0.35356 | C: 0.26171 ** [JOINT LOSS] ** : 0.898536 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002602 | Grad Max: 0.104094 -> Layer: shared_layers.0.bias | Grad Mean: 0.097140 | Grad Max: 0.353158 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006308 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001441 | Grad Max: 0.001441 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000591 | Grad Max: 0.331074 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009631 | 
Grad Max: 1.862821 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002532 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001702 | Grad Max: 0.012283 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000134 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000328 | Grad Max: 0.001748 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000528 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000494 | Grad Max: 0.001229 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002109 | Grad Max: 0.002109 [GRADIENT NORM TOTAL] 3.0726 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.703 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50706637 0.49293366] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 685/1363 | B: 545/1311 | C: 335/1713 [LOSS Ex1] A: 0.65172 | B: 0.64795 | C: 0.64212 [LOGITS Ex2 A] Mean Abs: 1.927 | Max: 7.170 [LOSS Ex2] A: 0.15623 | B: 0.35631 | C: 0.29368 ** [JOINT LOSS] ** : 0.916006 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004587 | Grad Max: 0.189319 -> Layer: shared_layers.0.bias | Grad Mean: 0.126948 | Grad Max: 0.482543 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.006643 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007212 | Grad Max: 0.007212 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000937 | Grad Max: 0.111236 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016235 | Grad Max: 0.618981 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.004574 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006298 | Grad Max: 0.030756 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001455 | Grad Max: 0.004277 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 
0.000134 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000412 | Grad Max: 0.001135 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000520 | Grad Max: 0.001565 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008910 | Grad Max: 0.008910 [GRADIENT NORM TOTAL] 2.6042 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.657 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.510758 0.48924205] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 674/1374 | B: 579/1469 | C: 348/1700 [LOSS Ex1] A: 0.64925 | B: 0.64705 | C: 0.64194 [LOGITS Ex2 A] Mean Abs: 1.927 | Max: 5.918 [LOSS Ex2] A: 0.14713 | B: 0.37986 | C: 0.29207 ** [JOINT LOSS] ** : 0.919098 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.051171 -> Layer: shared_layers.0.bias | Grad Mean: 0.108573 | Grad Max: 0.457602 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002317 | Grad Max: 0.007003 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007662 | Grad Max: 0.007662 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.144771 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012977 | Grad Max: 0.812322 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004377 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004676 | Grad Max: 0.029701 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000236 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001002 | Grad Max: 0.003949 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000286 | Grad Max: 0.000811 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001072 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006764 | Grad Max: 0.006764 [GRADIENT NORM TOTAL] 2.3478 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 
[LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.678 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026678 0.49733222] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 673/1375 | B: 581/1467 | C: 341/1707 [LOSS Ex1] A: 0.64824 | B: 0.64782 | C: 0.64054 [LOGITS Ex2 A] Mean Abs: 1.909 | Max: 5.750 [LOSS Ex2] A: 0.15385 | B: 0.38169 | C: 0.26414 ** [JOINT LOSS] ** : 0.912091 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.077411 -> Layer: shared_layers.0.bias | Grad Mean: 0.098181 | Grad Max: 0.545955 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006189 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001615 | Grad Max: 0.001615 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000659 | Grad Max: 0.183948 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010956 | Grad Max: 1.029823 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.004784 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002650 | Grad Max: 0.032812 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000191 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000378 | Grad Max: 0.002547 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000098 | Grad Max: 0.000575 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000302 | Grad Max: 0.000888 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001702 | Grad Max: 0.001702 [GRADIENT NORM TOTAL] 2.3820 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.561 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5035884 0.49641162] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 652/1396 | B: 591/1457 | C: 358/1690 [LOSS Ex1] A: 0.65540 | B: 0.64385 | C: 0.63732 [LOGITS Ex2 A] Mean Abs: 1.923 | Max: 6.219 [LOSS Ex2] A: 0.14785 | B: 0.36394 | C: 0.26363 ** [JOINT 
LOSS] ** : 0.903993 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003787 | Grad Max: 0.121252 -> Layer: shared_layers.0.bias | Grad Mean: 0.236102 | Grad Max: 1.311202 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.005679 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002106 | Grad Max: 0.002106 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001526 | Grad Max: 0.197015 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027067 | Grad Max: 1.095376 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008248 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012480 | Grad Max: 0.069291 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000361 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002544 | Grad Max: 0.005758 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000147 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000699 | Grad Max: 0.001664 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000693 | Grad Max: 0.002251 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015084 | Grad Max: 0.015084 [GRADIENT NORM TOTAL] 4.8406 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.491 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53916943 0.46083054] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.054 [MASKS] A(Pass/Fail): 650/1398 | B: 546/1310 | C: 308/1740 [LOSS Ex1] A: 0.65523 | B: 0.64773 | C: 0.64364 [LOGITS Ex2 A] Mean Abs: 1.872 | Max: 7.241 [LOSS Ex2] A: 0.16472 | B: 0.34831 | C: 0.28146 ** [JOINT LOSS] ** : 0.913695 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004327 | Grad Max: 0.179777 -> Layer: shared_layers.0.bias | Grad Mean: 0.070393 | Grad Max: 0.337605 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005715 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005712 | Grad Max: 0.005712 -> Layer: exit2_layers.0.weight | Grad Mean: 
0.000779 | Grad Max: 0.255199 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012363 | Grad Max: 1.430524 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.002708 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002711 | Grad Max: 0.016723 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000201 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000674 | Grad Max: 0.002451 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000196 | Grad Max: 0.000727 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001259 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004735 | Grad Max: 0.004735 [GRADIENT NORM TOTAL] 2.4282 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.632 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6958346 0.30416542] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.056 [MASKS] A(Pass/Fail): 701/1347 | B: 579/1469 | C: 326/1722 [LOSS Ex1] A: 0.65023 | B: 0.64682 | C: 0.64379 [LOGITS Ex2 A] Mean Abs: 1.927 | Max: 6.695 [LOSS Ex2] A: 0.14623 | B: 0.37875 | C: 0.27549 ** [JOINT LOSS] ** : 0.913772 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002444 | Grad Max: 0.079410 -> Layer: shared_layers.0.bias | Grad Mean: 0.076900 | Grad Max: 0.277646 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006159 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002450 | Grad Max: 0.002450 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000596 | Grad Max: 0.112218 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009764 | Grad Max: 0.623468 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003508 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002020 | Grad Max: 0.022959 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000148 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000327 | Grad 
Max: 0.001831 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000526 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000277 | Grad Max: 0.000843 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001392 | Grad Max: 0.001392 [GRADIENT NORM TOTAL] 1.7739 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.707 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500752 0.499248] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 691/1357 | B: 581/1467 | C: 328/1720 [LOSS Ex1] A: 0.65588 | B: 0.64759 | C: 0.64169 [LOGITS Ex2 A] Mean Abs: 1.910 | Max: 6.132 [LOSS Ex2] A: 0.14497 | B: 0.36450 | C: 0.26686 ** [JOINT LOSS] ** : 0.907168 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004114 | Grad Max: 0.133682 -> Layer: shared_layers.0.bias | Grad Mean: 0.140510 | Grad Max: 0.533981 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005531 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003480 | Grad Max: 0.003480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000997 | Grad Max: 0.170212 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017329 | Grad Max: 0.930708 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.005168 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008378 | Grad Max: 0.038218 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000362 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001929 | Grad Max: 0.005340 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000542 | Grad Max: 0.001336 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000688 | Grad Max: 0.001906 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012346 | Grad Max: 0.012346 [GRADIENT NORM TOTAL] 2.8018 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.480 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6578215 0.34217852] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 674/1374 | B: 591/1457 | C: 324/1724 [LOSS Ex1] A: 0.65250 | B: 0.64362 | C: 0.64259 [LOGITS Ex2 A] Mean Abs: 1.931 | Max: 6.341 [LOSS Ex2] A: 0.16779 | B: 0.35156 | C: 0.29391 ** [JOINT LOSS] ** : 0.917324 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003797 | Grad Max: 0.152651 -> Layer: shared_layers.0.bias | Grad Mean: 0.164791 | Grad Max: 0.748913 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006600 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007894 | Grad Max: 0.007894 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001183 | Grad Max: 0.172921 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021271 | Grad Max: 0.974825 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006117 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009302 | Grad Max: 0.044900 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000307 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002043 | Grad Max: 0.005410 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000540 | Grad Max: 0.001585 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000493 | Grad Max: 0.001338 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009815 | Grad Max: 0.009815 [GRADIENT NORM TOTAL] 3.5879 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.561 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5988689 0.40113115] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 565/1051 | B: 546/1310 | C: 331/1717 [LOSS Ex1] A: 0.65095 | B: 0.64751 | C: 0.64395 [LOGITS Ex2 A] Mean Abs: 
1.980 | Max: 6.391 [LOSS Ex2] A: 0.15004 | B: 0.34948 | C: 0.26137 ** [JOINT LOSS] ** : 0.901102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006353 | Grad Max: 0.209249 -> Layer: shared_layers.0.bias | Grad Mean: 0.126074 | Grad Max: 0.514332 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006543 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000000 | Grad Max: 0.000000 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001102 | Grad Max: 0.138746 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019205 | Grad Max: 0.645856 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.004496 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007533 | Grad Max: 0.034574 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000264 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001755 | Grad Max: 0.004254 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000502 | Grad Max: 0.001316 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000624 | Grad Max: 0.001891 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011342 | Grad Max: 0.011342 [GRADIENT NORM TOTAL] 2.8226 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.709 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50702876 0.49297124] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 685/1363 | B: 581/1467 | C: 345/1703 [LOSS Ex1] A: 0.65138 | B: 0.64660 | C: 0.64096 [LOGITS Ex2 A] Mean Abs: 1.941 | Max: 5.787 [LOSS Ex2] A: 0.14805 | B: 0.39001 | C: 0.29980 ** [JOINT LOSS] ** : 0.925596 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003549 | Grad Max: 0.112017 -> Layer: shared_layers.0.bias | Grad Mean: 0.300744 | Grad Max: 1.526169 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.006227 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.005485 | Grad Max: 0.005485 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001969 | Grad Max: 0.217558 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035242 | Grad Max: 1.240974 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.010580 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016063 | Grad Max: 0.095266 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000395 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003273 | Grad Max: 0.007241 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000892 | Grad Max: 0.002016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000904 | Grad Max: 0.001951 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019491 | Grad Max: 0.019491 [GRADIENT NORM TOTAL] 6.1862 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.663 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107572 0.4892428] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 676/1372 | B: 582/1466 | C: 349/1699 [LOSS Ex1] A: 0.64888 | B: 0.64738 | C: 0.63926 [LOGITS Ex2 A] Mean Abs: 1.922 | Max: 6.370 [LOSS Ex2] A: 0.14823 | B: 0.38854 | C: 0.29717 ** [JOINT LOSS] ** : 0.923152 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003125 | Grad Max: 0.155507 -> Layer: shared_layers.0.bias | Grad Mean: 0.402405 | Grad Max: 1.932973 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.006287 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002028 | Grad Max: 0.002028 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002500 | Grad Max: 0.262708 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046757 | Grad Max: 1.468260 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.014984 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023687 | Grad Max: 0.124427 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | 
Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005001 | Grad Max: 0.010213 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001385 | Grad Max: 0.003135 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001469 | Grad Max: 0.002717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029935 | Grad Max: 0.029935 [GRADIENT NORM TOTAL] 8.1719 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.685 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026428 0.4973572] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.056 [MASKS] A(Pass/Fail): 673/1375 | B: 592/1456 | C: 323/1725 [LOSS Ex1] A: 0.64787 | B: 0.64340 | C: 0.64080 [LOGITS Ex2 A] Mean Abs: 1.924 | Max: 7.738 [LOSS Ex2] A: 0.15690 | B: 0.35529 | C: 0.26723 ** [JOINT LOSS] ** : 0.903825 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002987 | Grad Max: 0.126158 -> Layer: shared_layers.0.bias | Grad Mean: 0.051928 | Grad Max: 0.218182 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002346 | Grad Max: 0.006702 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001572 | Grad Max: 0.001572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000579 | Grad Max: 0.118387 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008948 | Grad Max: 0.657061 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002451 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002009 | Grad Max: 0.015753 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000139 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000459 | Grad Max: 0.001965 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000134 | Grad Max: 0.000615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000417 | Grad Max: 0.001300 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002833 | Grad Max: 
0.002833 [GRADIENT NORM TOTAL] 1.6051 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.567 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50349283 0.49650717] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.056 [MASKS] A(Pass/Fail): 653/1395 | B: 548/1308 | C: 228/1148 [LOSS Ex1] A: 0.65506 | B: 0.64730 | C: 0.64237 [LOGITS Ex2 A] Mean Abs: 1.932 | Max: 6.943 [LOSS Ex2] A: 0.13854 | B: 0.35965 | C: 0.27434 ** [JOINT LOSS] ** : 0.905754 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003110 | Grad Max: 0.095026 -> Layer: shared_layers.0.bias | Grad Mean: 0.199225 | Grad Max: 1.146479 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005689 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003286 | Grad Max: 0.003286 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001335 | Grad Max: 0.209819 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023845 | Grad Max: 1.161205 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.007931 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009963 | Grad Max: 0.067029 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000271 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002012 | Grad Max: 0.004715 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000555 | Grad Max: 0.001586 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000556 | Grad Max: 0.001803 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012090 | Grad Max: 0.012090 [GRADIENT NORM TOTAL] 4.5088 [EPOCH SUMMARY] Train Loss: 0.9115 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8969 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 99/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.099 | Max: 0.497 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53936666 0.46063334] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.544 | Std: 0.054 [MASKS] A(Pass/Fail): 650/1398 | B: 583/1465 | C: 346/1702 [LOSS Ex1] A: 0.65489 | B: 0.64639 | C: 0.63863 [LOGITS Ex2 A] Mean Abs: 1.912 | Max: 6.670 [LOSS Ex2] A: 0.16178 | B: 0.37320 | C: 0.28793 ** [JOINT LOSS] ** : 0.920942 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004663 | Grad Max: 0.198675 -> Layer: shared_layers.0.bias | Grad Mean: 0.094583 | Grad Max: 0.422707 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.006250 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005382 | Grad Max: 0.005382 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000893 | Grad Max: 0.146546 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013795 | Grad Max: 0.836719 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.005276 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002836 | Grad Max: 0.031861 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000335 | Grad Max: 0.001955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000152 | Grad Max: 0.000651 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001144 | Grad Max: 0.001144 [GRADIENT NORM TOTAL] 2.3153 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.639 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6978184 0.3021816] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.057 [MASKS] A(Pass/Fail): 701/1347 | B: 583/1465 | C: 351/1697 [LOSS Ex1] A: 0.64986 | B: 0.64716 | C: 0.64164 [LOGITS Ex2 A] Mean Abs: 1.944 | Max: 6.581 [LOSS Ex2] A: 0.14955 | B: 0.37739 | C: 0.27303 ** [JOINT LOSS] ** : 0.912880 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.181466 -> Layer: shared_layers.0.bias | Grad Mean: 0.180834 | Grad Max: 0.754812 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006118 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003880 | Grad Max: 0.003880 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001211 | Grad Max: 0.398425 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021454 | Grad Max: 2.227343 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.004925 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008558 | Grad Max: 0.043669 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000296 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001998 | Grad Max: 0.005069 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000551 | Grad Max: 0.001648 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000625 | Grad Max: 0.001714 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011501 | Grad Max: 0.011501 [GRADIENT NORM TOTAL] 4.5192 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.714 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007374 0.49926254] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 693/1355 | B: 595/1453 | C: 326/1722 [LOSS Ex1] A: 0.65554 | B: 0.64317 | C: 0.64059 [LOGITS Ex2 A] Mean Abs: 1.959 | Max: 5.940 [LOSS Ex2] A: 0.14091 | B: 0.35635 | C: 0.27957 ** [JOINT LOSS] ** : 0.905378 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003614 | Grad Max: 0.119542 
-> Layer: shared_layers.0.bias | Grad Mean: 0.060995 | Grad Max: 0.447178 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.005639 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003320 | Grad Max: 0.003320 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000620 | Grad Max: 0.151932 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009743 | Grad Max: 0.858445 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003630 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002040 | Grad Max: 0.019361 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000149 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000407 | Grad Max: 0.002078 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000117 | Grad Max: 0.000610 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000330 | Grad Max: 0.001124 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002154 | Grad Max: 0.002154 [GRADIENT NORM TOTAL] 1.9225 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.485 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6592902 0.3407098] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.057 [MASKS] A(Pass/Fail): 675/1373 | B: 550/1306 | C: 321/1727 [LOSS Ex1] A: 0.65213 | B: 0.64709 | C: 0.64338 [LOGITS Ex2 A] Mean Abs: 1.971 | Max: 5.883 [LOSS Ex2] A: 0.15423 | B: 0.35066 | C: 0.29177 ** [JOINT LOSS] ** : 0.913086 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002341 | Grad Max: 0.053266 -> Layer: shared_layers.0.bias | Grad Mean: 0.076369 | Grad Max: 0.282901 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.006108 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003131 | Grad Max: 0.003131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000737 | Grad Max: 0.132429 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013166 | Grad Max: 0.723791 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.003083 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004739 | Grad Max: 0.024761 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000256 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001014 | Grad Max: 0.003109 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000294 | Grad Max: 0.000847 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000399 | Grad Max: 0.001218 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007132 | Grad Max: 0.007132 [GRADIENT NORM TOTAL] 2.3690 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.567 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.599739 0.400261] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.057 [MASKS] A(Pass/Fail): 565/1051 | B: 589/1459 | C: 322/1726 [LOSS Ex1] A: 0.65057 | B: 0.64616 | C: 0.64483 [LOGITS Ex2 A] Mean Abs: 1.986 | Max: 5.978 [LOSS Ex2] A: 0.14665 | B: 0.37238 | C: 0.29098 ** [JOINT LOSS] ** : 0.917190 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004051 | Grad Max: 0.122752 -> Layer: shared_layers.0.bias | Grad Mean: 0.153748 | Grad Max: 0.727666 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.006138 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005513 | Grad Max: 0.005513 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001088 | Grad Max: 0.221791 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018710 | Grad Max: 1.245827 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.007261 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005757 | Grad Max: 0.056577 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000245 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001020 | Grad Max: 0.003293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000082 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000287 | Grad Max: 0.000921 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000328 | Grad Max: 0.001027 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007366 | Grad Max: 0.007366 [GRADIENT NORM TOTAL] 3.4932 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.717 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50706446 0.49293554] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 685/1363 | B: 586/1462 | C: 330/1718 [LOSS Ex1] A: 0.65101 | B: 0.64693 | C: 0.64401 [LOGITS Ex2 A] Mean Abs: 1.958 | Max: 6.533 [LOSS Ex2] A: 0.15694 | B: 0.38455 | C: 0.26710 ** [JOINT LOSS] ** : 0.916846 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006526 | Grad Max: 0.287376 -> Layer: shared_layers.0.bias | Grad Mean: 0.088711 | Grad Max: 0.413574 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005746 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004214 | Grad Max: 0.004214 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000968 | Grad Max: 0.129035 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014181 | Grad Max: 0.721700 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.003782 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002521 | Grad Max: 0.023948 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000416 | Grad Max: 0.002139 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000589 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000224 | Grad Max: 0.000886 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000337 | Grad Max: 0.000337 [GRADIENT NORM TOTAL] 2.3964 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.106 | Max: 0.670 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51074207 0.48925793] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 677/1371 | B: 595/1453 | C: 331/1717 [LOSS Ex1] A: 0.64848 | B: 0.64291 | C: 0.64228 [LOGITS Ex2 A] Mean Abs: 1.975 | Max: 6.932 [LOSS Ex2] A: 0.15132 | B: 0.35201 | C: 0.26996 ** [JOINT LOSS] ** : 0.902322 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005406 | Grad Max: 0.191616 -> Layer: shared_layers.0.bias | Grad Mean: 0.132959 | Grad Max: 0.629512 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002297 | Grad Max: 0.006617 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002801 | Grad Max: 0.002801 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000998 | Grad Max: 0.130605 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017158 | Grad Max: 0.690229 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.004768 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006669 | Grad Max: 0.037654 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001554 | Grad Max: 0.004104 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000420 | Grad Max: 0.001155 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000479 | Grad Max: 0.001513 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008880 | Grad Max: 0.008880 [GRADIENT NORM TOTAL] 2.8333 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.692 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50277233 0.4972276 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 674/1374 | B: 550/1306 | C: 363/1685 [LOSS Ex1] A: 0.64746 | B: 0.64684 | C: 0.63831 [LOGITS Ex2 A] Mean Abs: 1.939 | Max: 6.780 [LOSS Ex2] A: 0.16346 | B: 0.35414 | C: 0.28170 ** [JOINT LOSS] ** : 
0.910634 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002954 | Grad Max: 0.074625 -> Layer: shared_layers.0.bias | Grad Mean: 0.210012 | Grad Max: 0.921174 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.006539 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001501 | Grad Max: 0.001501 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001448 | Grad Max: 0.198511 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026822 | Grad Max: 1.110695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.011451 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012933 | Grad Max: 0.090793 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002727 | Grad Max: 0.006249 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000744 | Grad Max: 0.001710 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000770 | Grad Max: 0.001926 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016072 | Grad Max: 0.016072 [GRADIENT NORM TOTAL] 4.4046 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.573 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503527 0.496473] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.545 | Std: 0.056 [MASKS] A(Pass/Fail): 654/1394 | B: 589/1459 | C: 355/1693 [LOSS Ex1] A: 0.65471 | B: 0.64591 | C: 0.63892 [LOGITS Ex2 A] Mean Abs: 1.925 | Max: 6.646 [LOSS Ex2] A: 0.14942 | B: 0.37511 | C: 0.25435 ** [JOINT LOSS] ** : 0.906142 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004833 | Grad Max: 0.227712 -> Layer: shared_layers.0.bias | Grad Mean: 0.086222 | Grad Max: 0.421480 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.006065 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006922 | Grad Max: 0.006922 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad 
Max: 0.084127 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015035 | Grad Max: 0.451856 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.005228 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006341 | Grad Max: 0.031658 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000328 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001551 | Grad Max: 0.004714 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000118 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000434 | Grad Max: 0.001246 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000572 | Grad Max: 0.001593 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009726 | Grad Max: 0.009726 [GRADIENT NORM TOTAL] 2.1604 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.100 | Max: 0.503 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5395455 0.46045455] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 650/1398 | B: 587/1461 | C: 383/1665 [LOSS Ex1] A: 0.65454 | B: 0.64668 | C: 0.63670 [LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.635 [LOSS Ex2] A: 0.16367 | B: 0.39300 | C: 0.29746 ** [JOINT LOSS] ** : 0.930686 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004982 | Grad Max: 0.213308 -> Layer: shared_layers.0.bias | Grad Mean: 0.532631 | Grad Max: 2.805570 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006362 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006043 | Grad Max: 0.006044 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003430 | Grad Max: 0.409364 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063531 | Grad Max: 2.289667 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.021397 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030777 | Grad Max: 0.182717 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000790 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006349 | Grad Max: 0.013457 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000314 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001695 | Grad Max: 0.003848 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001681 | Grad Max: 0.003270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033673 | Grad Max: 0.033673 [GRADIENT NORM TOTAL] 11.2906 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.646 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6998182 0.3001818] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.058 [MASKS] A(Pass/Fail): 701/1347 | B: 595/1453 | C: 358/1690 [LOSS Ex1] A: 0.64947 | B: 0.64266 | C: 0.63878 [LOGITS Ex2 A] Mean Abs: 1.996 | Max: 7.641 [LOSS Ex2] A: 0.15528 | B: 0.36899 | C: 0.29220 ** [JOINT LOSS] ** : 0.915797 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004637 | Grad Max: 0.250811 -> Layer: shared_layers.0.bias | Grad Mean: 0.634245 | Grad Max: 2.920288 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.006849 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009956 | Grad Max: 0.009956 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003921 | Grad Max: 0.501487 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073638 | Grad Max: 2.806187 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000670 | Grad Max: 0.027398 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039082 | Grad Max: 0.243056 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000902 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008150 | Grad Max: 0.016798 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000429 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002213 | Grad Max: 0.005410 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002240 | Grad Max: 0.003898 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046237 | Grad Max: 0.046237 [GRADIENT NORM TOTAL] 12.9200 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.722 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50069064 0.49930933] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 695/1353 | B: 551/1305 | C: 357/1691 [LOSS Ex1] A: 0.65520 | B: 0.64662 | C: 0.64017 [LOGITS Ex2 A] Mean Abs: 1.960 | Max: 6.061 [LOSS Ex2] A: 0.14758 | B: 0.35546 | C: 0.25005 ** [JOINT LOSS] ** : 0.898357 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005335 | Grad Max: 0.178315 -> Layer: shared_layers.0.bias | Grad Mean: 0.214263 | Grad Max: 0.830343 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005570 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000108 | Grad Max: 0.000108 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001616 | Grad Max: 0.247969 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027728 | Grad Max: 1.379961 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.009189 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012046 | Grad Max: 0.082772 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000322 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002311 | Grad Max: 0.005716 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000147 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000638 | Grad Max: 0.001952 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000616 | Grad Max: 0.002107 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014136 | Grad Max: 0.014136 [GRADIENT NORM TOTAL] 4.5596 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.490 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6607528 0.33924723] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 675/1373 | B: 589/1459 | C: 352/1696 [LOSS Ex1] A: 0.65176 | B: 0.64569 | C: 0.63993 [LOGITS Ex2 A] Mean Abs: 
1.905 | Max: 6.246 [LOSS Ex2] A: 0.16694 | B: 0.38354 | C: 0.29506 ** [JOINT LOSS] ** : 0.927640 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009019 | Grad Max: 0.269181 -> Layer: shared_layers.0.bias | Grad Mean: 0.451755 | Grad Max: 1.926358 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006223 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001505 | Grad Max: 0.001505 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002963 | Grad Max: 0.483830 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054606 | Grad Max: 2.708512 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000445 | Grad Max: 0.015873 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025394 | Grad Max: 0.135724 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000662 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005583 | Grad Max: 0.011440 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000307 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001510 | Grad Max: 0.003773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001595 | Grad Max: 0.003014 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031550 | Grad Max: 0.031550 [GRADIENT NORM TOTAL] 9.5256 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.572 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6005669 0.3994331] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 566/1050 | B: 587/1461 | C: 231/1145 [LOSS Ex1] A: 0.65020 | B: 0.64648 | C: 0.64229 [LOGITS Ex2 A] Mean Abs: 1.943 | Max: 6.436 [LOSS Ex2] A: 0.14924 | B: 0.40213 | C: 0.29281 ** [JOINT LOSS] ** : 0.927715 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009603 | Grad Max: 0.301469 -> Layer: shared_layers.0.bias | Grad Mean: 0.663728 | Grad Max: 2.867584 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006344 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.003636 | Grad Max: 0.003636 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004200 | Grad Max: 0.624758 -> Layer: exit2_layers.0.bias | Grad Mean: 0.078505 | Grad Max: 3.443604 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000661 | Grad Max: 0.022856 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038256 | Grad Max: 0.208065 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000840 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008207 | Grad Max: 0.016443 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000430 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002250 | Grad Max: 0.005046 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002377 | Grad Max: 0.004655 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048245 | Grad Max: 0.048245 [GRADIENT NORM TOTAL] 13.6166 [EPOCH SUMMARY] Train Loss: 0.9147 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8970 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 100/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.724 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50714004 0.49286 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 686/1362 | B: 595/1453 | C: 376/1672 [LOSS Ex1] A: 0.65066 | B: 0.64246 | C: 0.63743 [LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.340 [LOSS Ex2] A: 0.14117 | B: 0.36981 | C: 0.28465 ** [JOINT LOSS] ** : 0.908725 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005817 | Grad Max: 0.154150 -> Layer: shared_layers.0.bias | Grad Mean: 0.461543 | Grad Max: 2.078842 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.006028 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000349 | Grad Max: 0.000349 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002873 | Grad Max: 0.450988 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.053843 | Grad Max: 2.525108 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000443 | Grad Max: 0.016347 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025759 | Grad Max: 0.146840 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000589 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005449 | Grad Max: 0.010587 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001478 | Grad Max: 0.003235 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001510 | Grad Max: 0.002667 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030893 | Grad Max: 0.030893 [GRADIENT NORM TOTAL] 9.5671 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.677 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51071805 0.489282 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 679/1369 | B: 551/1305 | C: 384/1664 [LOSS Ex1] A: 0.64813 | B: 0.64643 | C: 0.63772 [LOGITS Ex2 A] Mean Abs: 1.966 | Max: 5.603 [LOSS Ex2] A: 0.14941 | B: 0.34913 | C: 0.29525 ** [JOINT LOSS] ** : 0.908691 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005582 | Grad Max: 0.184128 -> Layer: shared_layers.0.bias | Grad Mean: 0.118806 | Grad Max: 0.557535 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002347 | Grad Max: 0.006982 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009199 | Grad Max: 0.009199 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001094 | Grad Max: 0.157771 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018526 | Grad Max: 0.857067 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.005486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008836 | Grad Max: 0.041172 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000296 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002061 | Grad Max: 0.005135 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000153 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000559 | Grad Max: 0.001712 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001338 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011206 | Grad Max: 0.011206 [GRADIENT NORM TOTAL] 2.6726 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.698 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028224 0.4971776] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.057 [MASKS] A(Pass/Fail): 677/1371 | B: 589/1459 | C: 339/1709 [LOSS Ex1] A: 0.64712 | B: 0.64551 | C: 0.64213 [LOGITS Ex2 A] Mean Abs: 1.946 | Max: 8.072 [LOSS Ex2] A: 0.16552 | B: 0.37274 | C: 0.29654 ** [JOINT LOSS] ** : 0.923188 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006238 | Grad Max: 0.256179 -> Layer: shared_layers.0.bias | Grad Mean: 0.224252 | Grad Max: 0.906970 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.006803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005780 | Grad Max: 0.005780 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001598 | Grad Max: 0.200810 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027485 | Grad Max: 1.121197 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000257 | Grad Max: 0.009542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014390 | Grad Max: 0.075749 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000455 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003226 | Grad Max: 0.007650 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000880 | Grad Max: 0.002158 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000871 | Grad Max: 0.001820 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017663 | Grad Max: 0.017663 [GRADIENT NORM TOTAL] 4.5137 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.577 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034805 0.4965195] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.057 [MASKS] A(Pass/Fail): 655/1393 | B: 587/1461 | C: 383/1665 [LOSS Ex1] A: 0.65442 | B: 0.64630 | C: 0.63761 [LOGITS Ex2 A] Mean Abs: 1.893 | Max: 5.919 [LOSS Ex2] A: 0.14406 | B: 0.38353 | C: 0.24818 ** [JOINT LOSS] ** : 0.904701 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002543 | Grad Max: 0.070123 -> Layer: shared_layers.0.bias | Grad Mean: 0.206282 | Grad Max: 0.905786 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005982 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004674 | Grad Max: 0.004674 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001337 | Grad Max: 0.108417 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024208 | Grad Max: 0.596562 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.008216 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012851 | Grad Max: 0.074537 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000352 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002611 | Grad Max: 0.006445 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001698 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000732 | Grad Max: 0.001845 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015154 | Grad Max: 0.015154 [GRADIENT NORM TOTAL] 4.0296 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.508 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5397359 0.46026412] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 650/1398 | B: 595/1453 | C: 374/1674 [LOSS Ex1] A: 0.65428 | B: 0.64228 | C: 0.63853 [LOGITS Ex2 A] Mean Abs: 1.880 | Max: 6.076 [LOSS 
Ex2] A: 0.15703 | B: 0.35436 | C: 0.25833 ** [JOINT LOSS] ** : 0.901602 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002436 | Grad Max: 0.067971 -> Layer: shared_layers.0.bias | Grad Mean: 0.119079 | Grad Max: 0.686614 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006434 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006356 | Grad Max: 0.006356 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000877 | Grad Max: 0.210266 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015702 | Grad Max: 1.169948 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004807 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007278 | Grad Max: 0.040859 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001576 | Grad Max: 0.004185 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000422 | Grad Max: 0.001317 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001545 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008410 | Grad Max: 0.008410 [GRADIENT NORM TOTAL] 2.7253 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.651 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70143205 0.29856795] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.058 [MASKS] A(Pass/Fail): 701/1347 | B: 551/1305 | C: 360/1688 [LOSS Ex1] A: 0.64919 | B: 0.64625 | C: 0.64133 [LOGITS Ex2 A] Mean Abs: 1.951 | Max: 6.021 [LOSS Ex2] A: 0.14425 | B: 0.35340 | C: 0.29556 ** [JOINT LOSS] ** : 0.909994 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004375 | Grad Max: 0.141982 -> Layer: shared_layers.0.bias | Grad Mean: 0.360922 | Grad Max: 1.796864 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.006229 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005185 | Grad Max: 
0.005185 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002362 | Grad Max: 0.365596 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044099 | Grad Max: 2.035900 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000368 | Grad Max: 0.014247 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021550 | Grad Max: 0.133633 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000515 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004612 | Grad Max: 0.009916 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000229 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001267 | Grad Max: 0.003016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001303 | Grad Max: 0.002832 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027208 | Grad Max: 0.027208 [GRADIENT NORM TOTAL] 7.7475 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.727 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007449 0.49925512] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 695/1353 | B: 590/1458 | C: 356/1692 [LOSS Ex1] A: 0.65495 | B: 0.64533 | C: 0.64113 [LOGITS Ex2 A] Mean Abs: 1.968 | Max: 5.586 [LOSS Ex2] A: 0.14968 | B: 0.36760 | C: 0.26440 ** [JOINT LOSS] ** : 0.907698 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004321 | Grad Max: 0.155414 -> Layer: shared_layers.0.bias | Grad Mean: 0.363930 | Grad Max: 1.980787 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005721 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002159 | Grad Max: 0.002159 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002391 | Grad Max: 0.299329 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042835 | Grad Max: 1.675438 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.014109 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020377 | Grad Max: 0.120519 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000524 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004077 | Grad Max: 0.008853 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000193 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001102 | Grad Max: 0.002574 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001023 | Grad Max: 0.002434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022918 | Grad Max: 0.022918 [GRADIENT NORM TOTAL] 7.7167 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.495 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6619067 0.33809328] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.058 [MASKS] A(Pass/Fail): 676/1372 | B: 588/1460 | C: 397/1651 [LOSS Ex1] A: 0.65151 | B: 0.64612 | C: 0.63640 [LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.783 [LOSS Ex2] A: 0.16010 | B: 0.36261 | C: 0.28385 ** [JOINT LOSS] ** : 0.913529 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004801 | Grad Max: 0.215230 -> Layer: shared_layers.0.bias | Grad Mean: 0.111629 | Grad Max: 0.441943 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005842 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000865 | Grad Max: 0.000865 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001068 | Grad Max: 0.299396 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016923 | Grad Max: 1.670574 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.005645 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002976 | Grad Max: 0.040165 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000177 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000342 | Grad Max: 0.002687 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.000457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000261 | Grad Max: 0.000719 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000060 | Grad Max: 0.000060 [GRADIENT NORM 
TOTAL] 3.1949 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.577 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60130453 0.3986955 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.058 [MASKS] A(Pass/Fail): 569/1047 | B: 595/1453 | C: 342/1706 [LOSS Ex1] A: 0.64994 | B: 0.64209 | C: 0.64159 [LOGITS Ex2 A] Mean Abs: 1.959 | Max: 6.325 [LOSS Ex2] A: 0.14776 | B: 0.36245 | C: 0.29461 ** [JOINT LOSS] ** : 0.912815 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008347 | Grad Max: 0.248791 -> Layer: shared_layers.0.bias | Grad Mean: 0.437573 | Grad Max: 1.962619 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006474 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004923 | Grad Max: 0.004923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.508107 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054959 | Grad Max: 2.834235 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.016484 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026302 | Grad Max: 0.155125 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000575 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005777 | Grad Max: 0.011339 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001589 | Grad Max: 0.003429 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001692 | Grad Max: 0.003027 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033774 | Grad Max: 0.033774 [GRADIENT NORM TOTAL] 9.3294 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.729 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071095 0.4928905] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 686/1362 | B: 551/1305 | C: 373/1675 [LOSS Ex1] A: 
0.65041 | B: 0.64608 | C: 0.63797 [LOGITS Ex2 A] Mean Abs: 1.932 | Max: 7.262 [LOSS Ex2] A: 0.14570 | B: 0.36462 | C: 0.27459 ** [JOINT LOSS] ** : 0.906455 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006228 | Grad Max: 0.218407 -> Layer: shared_layers.0.bias | Grad Mean: 0.420264 | Grad Max: 1.838217 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.006036 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001649 | Grad Max: 0.001649 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002667 | Grad Max: 0.437925 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049107 | Grad Max: 2.460638 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.014186 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024186 | Grad Max: 0.122641 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000603 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005246 | Grad Max: 0.011709 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000276 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001400 | Grad Max: 0.003477 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001330 | Grad Max: 0.002640 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027646 | Grad Max: 0.027646 [GRADIENT NORM TOTAL] 8.4343 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.682 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108116 0.48918837] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 679/1369 | B: 590/1458 | C: 382/1666 [LOSS Ex1] A: 0.64785 | B: 0.64517 | C: 0.64249 [LOGITS Ex2 A] Mean Abs: 1.947 | Max: 5.656 [LOSS Ex2] A: 0.15268 | B: 0.37899 | C: 0.27146 ** [JOINT LOSS] ** : 0.912879 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004128 | Grad Max: 0.128807 -> Layer: shared_layers.0.bias | Grad Mean: 0.080290 | Grad Max: 0.315624 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad 
Max: 0.005938 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003415 | Grad Max: 0.003415 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000685 | Grad Max: 0.192103 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010979 | Grad Max: 1.033822 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003707 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001889 | Grad Max: 0.017395 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000366 | Grad Max: 0.002167 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000093 | Grad Max: 0.000508 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000188 | Grad Max: 0.000691 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001119 | Grad Max: 0.001119 [GRADIENT NORM TOTAL] 2.2742 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.704 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50283486 0.49716514] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 678/1370 | B: 590/1458 | C: 360/1688 [LOSS Ex1] A: 0.64685 | B: 0.64596 | C: 0.64174 [LOGITS Ex2 A] Mean Abs: 1.939 | Max: 7.114 [LOSS Ex2] A: 0.17289 | B: 0.38300 | C: 0.27349 ** [JOINT LOSS] ** : 0.921309 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008342 | Grad Max: 0.278815 -> Layer: shared_layers.0.bias | Grad Mean: 0.396617 | Grad Max: 1.675713 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002283 | Grad Max: 0.006872 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006483 | Grad Max: 0.006483 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002701 | Grad Max: 0.338292 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048470 | Grad Max: 1.885007 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000410 | Grad Max: 0.013876 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023412 | Grad Max: 0.127691 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000552 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005215 | Grad Max: 0.010615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000274 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001431 | Grad Max: 0.003205 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001404 | Grad Max: 0.002933 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029218 | Grad Max: 0.029218 [GRADIENT NORM TOTAL] 7.9446 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.582 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034557 0.49654427] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.057 [MASKS] A(Pass/Fail): 655/1393 | B: 595/1453 | C: 365/1683 [LOSS Ex1] A: 0.65418 | B: 0.64193 | C: 0.64105 [LOGITS Ex2 A] Mean Abs: 1.919 | Max: 5.973 [LOSS Ex2] A: 0.14167 | B: 0.36094 | C: 0.27664 ** [JOINT LOSS] ** : 0.905466 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004224 | Grad Max: 0.131551 -> Layer: shared_layers.0.bias | Grad Mean: 0.234346 | Grad Max: 0.858160 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006135 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006884 | Grad Max: 0.006884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001514 | Grad Max: 0.226866 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027523 | Grad Max: 1.133134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000209 | Grad Max: 0.008571 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012065 | Grad Max: 0.075100 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000346 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002647 | Grad Max: 0.006221 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000727 | Grad Max: 0.001596 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000755 | Grad Max: 0.002177 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.015737 | Grad Max: 0.015737 [GRADIENT NORM TOTAL] 4.7485 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.101 | Max: 0.512 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.539877 0.46012303] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.055 [MASKS] A(Pass/Fail): 652/1396 | B: 553/1303 | C: 220/1156 [LOSS Ex1] A: 0.65403 | B: 0.64593 | C: 0.64352 [LOGITS Ex2 A] Mean Abs: 1.856 | Max: 6.060 [LOSS Ex2] A: 0.15246 | B: 0.35642 | C: 0.26574 ** [JOINT LOSS] ** : 0.906031 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003294 | Grad Max: 0.063640 -> Layer: shared_layers.0.bias | Grad Mean: 0.219714 | Grad Max: 1.069928 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.005892 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005910 | Grad Max: 0.005910 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001510 | Grad Max: 0.192377 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028070 | Grad Max: 1.096393 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000248 | Grad Max: 0.009411 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014274 | Grad Max: 0.076426 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000377 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002964 | Grad Max: 0.006729 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000185 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.002234 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000683 | Grad Max: 0.002056 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014071 | Grad Max: 0.014071 [GRADIENT NORM TOTAL] 4.6629 [EPOCH SUMMARY] Train Loss: 0.9102 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8902 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8952 -> New: 0.8902) ############################## EPOCH 101/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.656 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70274216 0.2972578 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.058 [MASKS] A(Pass/Fail): 703/1345 | B: 591/1457 | C: 380/1668 [LOSS Ex1] A: 0.64893 | B: 0.64500 | C: 0.63880 [LOGITS Ex2 A] Mean Abs: 1.891 | Max: 6.458 [LOSS Ex2] A: 0.14451 | B: 0.37709 | C: 0.26814 ** [JOINT LOSS] ** : 0.907494 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.038686 -> Layer: shared_layers.0.bias | Grad Mean: 0.139843 | Grad Max: 0.564764 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006667 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008075 | Grad Max: 0.008075 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.135715 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018849 | Grad Max: 0.747233 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.005727 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009201 | Grad Max: 0.049766 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000307 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001927 | Grad Max: 0.005181 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000519 | Grad Max: 0.001473 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010272 | Grad Max: 0.010272 [GRADIENT NORM TOTAL] 3.1320 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.733 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50068337 0.49931663] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 696/1352 | B: 592/1456 | C: 359/1689 [LOSS Ex1] A: 0.65472 | B: 0.64580 | C: 0.64034 [LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.809 [LOSS Ex2] A: 0.14463 | B: 0.37103 | C: 0.26920 ** [JOINT LOSS] ** : 0.908571 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003137 | Grad Max: 0.127879 -> Layer: shared_layers.0.bias | Grad Mean: 0.290398 | Grad Max: 1.575827 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005470 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003401 | Grad Max: 0.003401 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001836 | Grad Max: 0.282147 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033828 | Grad Max: 1.582695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000260 | Grad Max: 0.011429 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015207 | Grad Max: 0.103402 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000389 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003094 | Grad Max: 0.006896 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000828 | Grad Max: 0.002091 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000706 | Grad Max: 0.002442 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016183 | Grad Max: 0.016183 [GRADIENT NORM TOTAL] 6.3822 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.498 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66274685 0.3372532 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.058 [MASKS] A(Pass/Fail): 677/1371 | B: 595/1453 | C: 359/1689 [LOSS Ex1] A: 0.65125 | B: 0.64176 | C: 0.64380 [LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.747 [LOSS Ex2] A: 0.15702 | B: 0.35995 | C: 0.27701 ** [JOINT LOSS] ** : 0.910263 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003006 | Grad Max: 0.108121 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.265343 | Grad Max: 1.436825 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005576 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001805 | Grad Max: 0.001805 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001794 | Grad Max: 0.260526 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032448 | Grad Max: 1.461302 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000243 | Grad Max: 0.012524 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014200 | Grad Max: 0.103645 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000336 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002928 | Grad Max: 0.006610 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000782 | Grad Max: 0.002388 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000668 | Grad Max: 0.001799 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015594 | Grad Max: 0.015594 [GRADIENT NORM TOTAL] 6.0342 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.581 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6017169 0.39828318] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.058 [MASKS] A(Pass/Fail): 569/1047 | B: 554/1302 | C: 399/1649 [LOSS Ex1] A: 0.64969 | B: 0.64577 | C: 0.63676 [LOGITS Ex2 A] Mean Abs: 1.952 | Max: 6.438 [LOSS Ex2] A: 0.14697 | B: 0.35226 | C: 0.27164 ** [JOINT LOSS] ** : 0.901029 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003471 | Grad Max: 0.106191 -> Layer: shared_layers.0.bias | Grad Mean: 0.128970 | Grad Max: 0.522312 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006500 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003893 | Grad Max: 0.003893 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000900 | Grad Max: 0.078952 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016111 | Grad Max: 0.434302 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000145 | Grad Max: 0.005744 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008104 | Grad Max: 0.042461 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000278 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001731 | Grad Max: 0.004800 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000464 | Grad Max: 0.001373 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000516 | Grad Max: 0.001580 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009354 | Grad Max: 0.009354 [GRADIENT NORM TOTAL] 2.4529 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.735 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071762 0.49282378] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 686/1362 | B: 591/1457 | C: 406/1642 [LOSS Ex1] A: 0.65016 | B: 0.64484 | C: 0.63508 [LOGITS Ex2 A] Mean Abs: 1.956 | Max: 6.693 [LOSS Ex2] A: 0.14687 | B: 0.36545 | C: 0.28073 ** [JOINT LOSS] ** : 0.907712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002720 | Grad Max: 0.112867 -> Layer: shared_layers.0.bias | Grad Mean: 0.107929 | Grad Max: 0.497126 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.005987 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002305 | Grad Max: 0.002305 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000859 | Grad Max: 0.329640 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014823 | Grad Max: 1.834348 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004455 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004589 | Grad Max: 0.030420 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000204 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000881 | Grad Max: 0.003300 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000229 | Grad 
Max: 0.000807 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000320 | Grad Max: 0.001102 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004064 | Grad Max: 0.004064 [GRADIENT NORM TOTAL] 3.2317 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.687 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51071924 0.48928076] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 681/1367 | B: 594/1454 | C: 385/1663 [LOSS Ex1] A: 0.64759 | B: 0.64563 | C: 0.64256 [LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.360 [LOSS Ex2] A: 0.15014 | B: 0.37294 | C: 0.28961 ** [JOINT LOSS] ** : 0.916158 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.049300 -> Layer: shared_layers.0.bias | Grad Mean: 0.115825 | Grad Max: 0.533249 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006356 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005648 | Grad Max: 0.005648 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000948 | Grad Max: 0.344423 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016944 | Grad Max: 1.925452 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000109 | Grad Max: 0.005679 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006159 | Grad Max: 0.043075 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001306 | Grad Max: 0.004492 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000339 | Grad Max: 0.001188 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.001107 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006022 | Grad Max: 0.006022 [GRADIENT NORM TOTAL] 3.6133 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.709 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5029533 0.4970467] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 680/1368 | B: 595/1453 | C: 371/1677 [LOSS Ex1] A: 0.64659 | B: 0.64158 | C: 0.64221 [LOGITS Ex2 A] Mean Abs: 1.935 | Max: 6.623 [LOSS Ex2] A: 0.16042 | B: 0.35340 | C: 0.27470 ** [JOINT LOSS] ** : 0.906298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.085797 -> Layer: shared_layers.0.bias | Grad Mean: 0.149927 | Grad Max: 0.670079 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006240 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002283 | Grad Max: 0.002283 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001037 | Grad Max: 0.147198 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018021 | Grad Max: 0.798465 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.007346 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006822 | Grad Max: 0.055697 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000240 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001298 | Grad Max: 0.003437 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000096 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000344 | Grad Max: 0.001097 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000369 | Grad Max: 0.001239 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007417 | Grad Max: 0.007417 [GRADIENT NORM TOTAL] 3.1952 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.586 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034871 0.49651292] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.057 [MASKS] A(Pass/Fail): 656/1392 | B: 554/1302 | C: 383/1665 [LOSS Ex1] A: 0.65394 | B: 0.64559 | C: 0.64041 [LOGITS Ex2 A] Mean Abs: 1.909 | Max: 6.737 [LOSS Ex2] A: 0.14201 | B: 0.35409 | C: 0.29118 ** [JOINT LOSS] ** : 0.909077 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.002930 | Grad Max: 0.083788 -> Layer: shared_layers.0.bias | Grad Mean: 0.134515 | Grad Max: 0.609364 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005554 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003716 | Grad Max: 0.003716 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000921 | Grad Max: 0.159714 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015876 | Grad Max: 0.823228 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006371 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007089 | Grad Max: 0.045934 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000231 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001389 | Grad Max: 0.003869 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000371 | Grad Max: 0.001113 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001205 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008225 | Grad Max: 0.008225 [GRADIENT NORM TOTAL] 2.7973 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.516 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.53990114 0.46009883] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.545 | Std: 0.056 [MASKS] A(Pass/Fail): 652/1396 | B: 592/1456 | C: 395/1653 [LOSS Ex1] A: 0.65380 | B: 0.64465 | C: 0.63627 [LOGITS Ex2 A] Mean Abs: 1.926 | Max: 6.484 [LOSS Ex2] A: 0.15404 | B: 0.37569 | C: 0.28011 ** [JOINT LOSS] ** : 0.914856 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004926 | Grad Max: 0.136862 -> Layer: shared_layers.0.bias | Grad Mean: 0.258351 | Grad Max: 1.206683 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.007129 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011910 | Grad Max: 0.011910 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001720 | Grad Max: 0.231437 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.031807 | Grad Max: 1.284966 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000260 | Grad Max: 0.008544 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015002 | Grad Max: 0.078891 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000390 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003206 | Grad Max: 0.007101 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000848 | Grad Max: 0.001860 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.001774 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016701 | Grad Max: 0.016701 [GRADIENT NORM TOTAL] 5.2784 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.662 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7040736 0.29592642] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.059 [MASKS] A(Pass/Fail): 703/1345 | B: 594/1454 | C: 398/1650 [LOSS Ex1] A: 0.64868 | B: 0.64544 | C: 0.63728 [LOGITS Ex2 A] Mean Abs: 1.974 | Max: 6.504 [LOSS Ex2] A: 0.14851 | B: 0.37791 | C: 0.25443 ** [JOINT LOSS] ** : 0.904079 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004681 | Grad Max: 0.124899 -> Layer: shared_layers.0.bias | Grad Mean: 0.330295 | Grad Max: 1.586409 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006208 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004013 | Grad Max: 0.004013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.292576 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039293 | Grad Max: 1.636535 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.012158 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019593 | Grad Max: 0.110384 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000501 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004196 | Grad Max: 0.009201 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001146 | Grad Max: 0.002798 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001119 | Grad Max: 0.002717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023706 | Grad Max: 0.023706 [GRADIENT NORM TOTAL] 6.8266 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.739 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50065154 0.49934843] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 696/1352 | B: 595/1453 | C: 356/1692 [LOSS Ex1] A: 0.65448 | B: 0.64138 | C: 0.63899 [LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.826 [LOSS Ex2] A: 0.14369 | B: 0.35858 | C: 0.25730 ** [JOINT LOSS] ** : 0.898141 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003970 | Grad Max: 0.182386 -> Layer: shared_layers.0.bias | Grad Mean: 0.120905 | Grad Max: 0.456346 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.005667 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000603 | Grad Max: 0.000603 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000923 | Grad Max: 0.171880 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015292 | Grad Max: 0.978465 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.004983 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005605 | Grad Max: 0.036028 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000270 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001345 | Grad Max: 0.003922 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000356 | Grad Max: 0.001253 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000421 | Grad Max: 0.001480 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006942 | Grad Max: 0.006942 [GRADIENT NORM TOTAL] 2.6815 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.502 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6638131 0.33618686] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.058 [MASKS] A(Pass/Fail): 677/1371 | B: 554/1302 | C: 383/1665 [LOSS Ex1] A: 0.65099 | B: 0.64540 | C: 0.63851 [LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.797 [LOSS Ex2] A: 0.15783 | B: 0.36422 | C: 0.26529 ** [JOINT LOSS] ** : 0.907412 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005356 | Grad Max: 0.194325 -> Layer: shared_layers.0.bias | Grad Mean: 0.259038 | Grad Max: 0.958494 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.006303 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002764 | Grad Max: 0.002764 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001707 | Grad Max: 0.183730 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030736 | Grad Max: 1.001492 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000267 | Grad Max: 0.009383 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015314 | Grad Max: 0.077703 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000429 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003385 | Grad Max: 0.008195 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000188 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000909 | Grad Max: 0.002174 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000848 | Grad Max: 0.002154 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017606 | Grad Max: 0.017606 [GRADIENT NORM TOTAL] 4.8950 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.585 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.602245 0.397755] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 570/1046 | B: 596/1452 | C: 357/1691 [LOSS Ex1] A: 0.64941 | B: 0.64447 | C: 0.64184 [LOGITS Ex2 A] Mean Abs: 1.983 | Max: 5.726 
[LOSS Ex2] A: 0.14821 | B: 0.36864 | C: 0.27467 ** [JOINT LOSS] ** : 0.909075 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.065982 -> Layer: shared_layers.0.bias | Grad Mean: 0.137886 | Grad Max: 0.579281 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006389 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002317 | Grad Max: 0.002317 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000924 | Grad Max: 0.174097 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016586 | Grad Max: 0.966999 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.005308 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007459 | Grad Max: 0.043230 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000257 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001525 | Grad Max: 0.004498 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000405 | Grad Max: 0.001089 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.001473 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008379 | Grad Max: 0.008379 [GRADIENT NORM TOTAL] 3.0259 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.741 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072345 0.4927655] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 688/1360 | B: 596/1452 | C: 274/1102 [LOSS Ex1] A: 0.64989 | B: 0.64525 | C: 0.63692 [LOGITS Ex2 A] Mean Abs: 1.956 | Max: 7.626 [LOSS Ex2] A: 0.14213 | B: 0.37338 | C: 0.28134 ** [JOINT LOSS] ** : 0.909636 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003248 | Grad Max: 0.127789 -> Layer: shared_layers.0.bias | Grad Mean: 0.136944 | Grad Max: 0.504707 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.005965 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001403 | Grad Max: 
0.001403 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001018 | Grad Max: 0.168720 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017558 | Grad Max: 0.928505 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.005404 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006590 | Grad Max: 0.040196 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000274 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001530 | Grad Max: 0.004586 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000405 | Grad Max: 0.001054 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001484 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007652 | Grad Max: 0.007652 [GRADIENT NORM TOTAL] 3.1810 [EPOCH SUMMARY] Train Loss: 0.9078 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8880 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8902 -> New: 0.8880) ############################## EPOCH 102/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.692 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51075923 0.48924074] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.059 [MASKS] A(Pass/Fail): 685/1363 | B: 598/1450 | C: 367/1681 [LOSS Ex1] A: 0.64729 | B: 0.64119 | C: 0.64147 [LOGITS Ex2 A] Mean Abs: 1.932 | Max: 6.682 [LOSS Ex2] A: 0.14265 | B: 0.36212 | C: 0.26600 ** [JOINT LOSS] ** : 0.900240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.057704 -> Layer: shared_layers.0.bias | Grad Mean: 0.150876 | Grad Max: 0.621492 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002419 | Grad Max: 0.007047 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012567 | Grad Max: 0.012567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000992 | Grad Max: 0.214372 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.017511 | Grad Max: 1.188504 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000133 | Grad Max: 0.005275 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007751 | Grad Max: 0.043650 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000254 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001573 | Grad Max: 0.004595 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000413 | Grad Max: 0.001206 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001313 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008202 | Grad Max: 0.008202 [GRADIENT NORM TOTAL] 3.1507 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.715 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029694 0.49703065] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 681/1367 | B: 557/1299 | C: 389/1659 [LOSS Ex1] A: 0.64629 | B: 0.64521 | C: 0.63635 [LOGITS Ex2 A] Mean Abs: 1.923 | Max: 6.244 [LOSS Ex2] A: 0.15846 | B: 0.35505 | C: 0.26837 ** [JOINT LOSS] ** : 0.903240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004401 | Grad Max: 0.211652 -> Layer: shared_layers.0.bias | Grad Mean: 0.150731 | Grad Max: 0.560755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006685 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003456 | Grad Max: 0.003456 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001185 | Grad Max: 0.151749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019503 | Grad Max: 0.838555 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.005385 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007116 | Grad Max: 0.046132 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000201 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001236 | Grad Max: 0.004012 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000316 | Grad Max: 0.000972 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000329 | Grad Max: 0.001171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006797 | Grad Max: 0.006797 [GRADIENT NORM TOTAL] 3.1374 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.591 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503421 0.49657896] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.546 | Std: 0.058 [MASKS] A(Pass/Fail): 656/1392 | B: 597/1451 | C: 389/1659 [LOSS Ex1] A: 0.65368 | B: 0.64427 | C: 0.63928 [LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.737 [LOSS Ex2] A: 0.14375 | B: 0.36669 | C: 0.27994 ** [JOINT LOSS] ** : 0.909200 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003472 | Grad Max: 0.092154 -> Layer: shared_layers.0.bias | Grad Mean: 0.217693 | Grad Max: 1.070309 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.006082 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005410 | Grad Max: 0.005411 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.165155 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025495 | Grad Max: 0.914001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.010089 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012501 | Grad Max: 0.083132 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000319 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002642 | Grad Max: 0.006034 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000703 | Grad Max: 0.001707 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000674 | Grad Max: 0.001752 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013790 | Grad Max: 0.013790 [GRADIENT NORM TOTAL] 4.3515 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.102 | Max: 0.521 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5401172 0.4598828] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.056 [MASKS] A(Pass/Fail): 654/1394 | B: 596/1452 | C: 352/1696 [LOSS Ex1] A: 0.65354 | B: 0.64506 | C: 0.64049 [LOGITS Ex2 A] Mean Abs: 1.907 | Max: 7.297 [LOSS Ex2] A: 0.15080 | B: 0.36545 | C: 0.26290 ** [JOINT LOSS] ** : 0.906076 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002323 | Grad Max: 0.052785 -> Layer: shared_layers.0.bias | Grad Mean: 0.118251 | Grad Max: 0.627693 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.006291 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007738 | Grad Max: 0.007738 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000972 | Grad Max: 0.315568 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017421 | Grad Max: 1.763107 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.005061 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005610 | Grad Max: 0.038316 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001093 | Grad Max: 0.003177 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000082 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000290 | Grad Max: 0.000908 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001290 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005694 | Grad Max: 0.005694 [GRADIENT NORM TOTAL] 3.6159 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.667 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7057205 0.29427952] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.059 [MASKS] A(Pass/Fail): 705/1343 | B: 599/1449 | C: 393/1655 [LOSS Ex1] A: 0.64838 | B: 0.64098 | C: 0.64035 [LOGITS Ex2 A] Mean Abs: 1.947 | Max: 5.915 [LOSS 
Ex2] A: 0.14079 | B: 0.35017 | C: 0.29085 ** [JOINT LOSS] ** : 0.903841 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004685 | Grad Max: 0.130553 -> Layer: shared_layers.0.bias | Grad Mean: 0.197897 | Grad Max: 0.966388 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.006042 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001527 | Grad Max: 0.001527 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001245 | Grad Max: 0.389328 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022473 | Grad Max: 2.174060 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.004661 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009534 | Grad Max: 0.042430 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000294 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002164 | Grad Max: 0.005115 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000123 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001509 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000685 | Grad Max: 0.001629 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013044 | Grad Max: 0.013044 [GRADIENT NORM TOTAL] 4.6607 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.745 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50069124 0.4993088 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.058 [MASKS] A(Pass/Fail): 696/1352 | B: 558/1298 | C: 406/1642 [LOSS Ex1] A: 0.65421 | B: 0.64502 | C: 0.63765 [LOGITS Ex2 A] Mean Abs: 1.946 | Max: 5.481 [LOSS Ex2] A: 0.13448 | B: 0.34876 | C: 0.27595 ** [JOINT LOSS] ** : 0.898690 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005944 | Grad Max: 0.213114 -> Layer: shared_layers.0.bias | Grad Mean: 0.145051 | Grad Max: 0.563141 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.005859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005584 | Grad Max: 
0.005584 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.184625 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018790 | Grad Max: 1.040065 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.005950 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008269 | Grad Max: 0.041985 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000302 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001968 | Grad Max: 0.005086 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000537 | Grad Max: 0.001340 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000641 | Grad Max: 0.001684 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011161 | Grad Max: 0.011161 [GRADIENT NORM TOTAL] 3.0996 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.506 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6649985 0.33500153] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.059 [MASKS] A(Pass/Fail): 679/1369 | B: 599/1449 | C: 370/1678 [LOSS Ex1] A: 0.65070 | B: 0.64408 | C: 0.63885 [LOGITS Ex2 A] Mean Abs: 1.978 | Max: 6.287 [LOSS Ex2] A: 0.16039 | B: 0.36822 | C: 0.27675 ** [JOINT LOSS] ** : 0.912999 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002830 | Grad Max: 0.118146 -> Layer: shared_layers.0.bias | Grad Mean: 0.262147 | Grad Max: 1.452426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005812 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002593 | Grad Max: 0.002593 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001681 | Grad Max: 0.225177 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030313 | Grad Max: 1.259806 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.009159 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014337 | Grad Max: 0.077252 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000370 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.002948 | Grad Max: 0.006972 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000159 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000792 | Grad Max: 0.001782 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000736 | Grad Max: 0.001864 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015946 | Grad Max: 0.015946 [GRADIENT NORM TOTAL] 5.6206 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.591 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6028875 0.3971125] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 571/1045 | B: 596/1452 | C: 386/1662 [LOSS Ex1] A: 0.64912 | B: 0.64486 | C: 0.63745 [LOGITS Ex2 A] Mean Abs: 2.026 | Max: 5.637 [LOSS Ex2] A: 0.14448 | B: 0.36928 | C: 0.29259 ** [JOINT LOSS] ** : 0.912592 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003281 | Grad Max: 0.111977 -> Layer: shared_layers.0.bias | Grad Mean: 0.259810 | Grad Max: 1.460370 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.007363 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011635 | Grad Max: 0.011635 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001676 | Grad Max: 0.238693 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030947 | Grad Max: 1.329581 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.009488 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014061 | Grad Max: 0.076562 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000370 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003004 | Grad Max: 0.007365 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000180 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000792 | Grad Max: 0.001974 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000688 | Grad Max: 0.001690 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015019 | Grad Max: 0.015019 [GRADIENT NORM 
TOTAL] 5.6018 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.747 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50719315 0.4928069 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 688/1360 | B: 599/1449 | C: 387/1661 [LOSS Ex1] A: 0.64960 | B: 0.64078 | C: 0.63962 [LOGITS Ex2 A] Mean Abs: 1.977 | Max: 6.453 [LOSS Ex2] A: 0.13588 | B: 0.35103 | C: 0.27466 ** [JOINT LOSS] ** : 0.897193 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.076278 -> Layer: shared_layers.0.bias | Grad Mean: 0.058168 | Grad Max: 0.365573 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.006440 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008998 | Grad Max: 0.008998 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000540 | Grad Max: 0.102542 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008568 | Grad Max: 0.570350 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003281 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001913 | Grad Max: 0.016460 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000145 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000292 | Grad Max: 0.001889 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000049 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000080 | Grad Max: 0.000404 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.000852 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001007 | Grad Max: 0.001008 [GRADIENT NORM TOTAL] 1.6182 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.699 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107295 0.4892705] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 685/1363 | B: 562/1294 | C: 389/1659 [LOSS Ex1] A: 
0.64699 | B: 0.64482 | C: 0.63766 [LOGITS Ex2 A] Mean Abs: 1.968 | Max: 6.306 [LOSS Ex2] A: 0.15022 | B: 0.35787 | C: 0.28439 ** [JOINT LOSS] ** : 0.907317 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002475 | Grad Max: 0.092894 -> Layer: shared_layers.0.bias | Grad Mean: 0.289832 | Grad Max: 1.308128 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002357 | Grad Max: 0.006933 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010784 | Grad Max: 0.010784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001835 | Grad Max: 0.270818 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033500 | Grad Max: 1.533499 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008661 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016221 | Grad Max: 0.078715 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000362 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003385 | Grad Max: 0.007425 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000912 | Grad Max: 0.002377 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000884 | Grad Max: 0.001740 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018793 | Grad Max: 0.018793 [GRADIENT NORM TOTAL] 6.0356 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.721 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029926 0.4970075] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 681/1367 | B: 601/1447 | C: 381/1667 [LOSS Ex1] A: 0.64599 | B: 0.64387 | C: 0.64104 [LOGITS Ex2 A] Mean Abs: 1.946 | Max: 6.509 [LOSS Ex2] A: 0.16317 | B: 0.37254 | C: 0.26890 ** [JOINT LOSS] ** : 0.911834 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004198 | Grad Max: 0.159009 -> Layer: shared_layers.0.bias | Grad Mean: 0.166618 | Grad Max: 0.805899 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad 
Max: 0.006132 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002876 | Grad Max: 0.002876 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001223 | Grad Max: 0.179237 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020725 | Grad Max: 0.983743 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.006416 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007177 | Grad Max: 0.047912 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001282 | Grad Max: 0.003457 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000347 | Grad Max: 0.000983 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000337 | Grad Max: 0.001212 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007611 | Grad Max: 0.007611 [GRADIENT NORM TOTAL] 3.7205 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.596 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5033652 0.49663472] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.058 [MASKS] A(Pass/Fail): 656/1392 | B: 596/1452 | C: 397/1651 [LOSS Ex1] A: 0.65341 | B: 0.64464 | C: 0.63768 [LOGITS Ex2 A] Mean Abs: 1.973 | Max: 6.014 [LOSS Ex2] A: 0.14457 | B: 0.36320 | C: 0.26705 ** [JOINT LOSS] ** : 0.903513 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004667 | Grad Max: 0.165724 -> Layer: shared_layers.0.bias | Grad Mean: 0.311302 | Grad Max: 1.239429 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.006191 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009281 | Grad Max: 0.009281 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.258932 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037417 | Grad Max: 1.352906 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.010277 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017693 | Grad Max: 0.097316 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003746 | Grad Max: 0.007739 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001007 | Grad Max: 0.002382 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001013 | Grad Max: 0.002500 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020760 | Grad Max: 0.020760 [GRADIENT NORM TOTAL] 6.3133 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.526 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5402258 0.45977423] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.057 [MASKS] A(Pass/Fail): 655/1393 | B: 602/1446 | C: 394/1654 [LOSS Ex1] A: 0.65328 | B: 0.64054 | C: 0.63582 [LOGITS Ex2 A] Mean Abs: 1.953 | Max: 6.100 [LOSS Ex2] A: 0.15073 | B: 0.34671 | C: 0.26770 ** [JOINT LOSS] ** : 0.898262 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005095 | Grad Max: 0.155621 -> Layer: shared_layers.0.bias | Grad Mean: 0.302388 | Grad Max: 1.264640 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.006490 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008039 | Grad Max: 0.008039 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001891 | Grad Max: 0.279816 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034770 | Grad Max: 1.565925 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000278 | Grad Max: 0.010086 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016451 | Grad Max: 0.097131 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000447 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003492 | Grad Max: 0.008180 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000948 | Grad Max: 0.002222 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000978 | Grad Max: 0.002398 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.020091 | Grad Max: 0.020091 [GRADIENT NORM TOTAL] 6.0364 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.672 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70726484 0.2927352 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.059 [MASKS] A(Pass/Fail): 705/1343 | B: 565/1291 | C: 266/1110 [LOSS Ex1] A: 0.64810 | B: 0.64459 | C: 0.63629 [LOGITS Ex2 A] Mean Abs: 1.970 | Max: 6.173 [LOSS Ex2] A: 0.14260 | B: 0.35212 | C: 0.27431 ** [JOINT LOSS] ** : 0.899335 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002973 | Grad Max: 0.112155 -> Layer: shared_layers.0.bias | Grad Mean: 0.132821 | Grad Max: 0.566331 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.006189 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004689 | Grad Max: 0.004689 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001041 | Grad Max: 0.358536 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018461 | Grad Max: 2.006120 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.006851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007170 | Grad Max: 0.054539 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001626 | Grad Max: 0.004293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000449 | Grad Max: 0.001206 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.001245 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009509 | Grad Max: 0.009509 [GRADIENT NORM TOTAL] 3.6678 [EPOCH SUMMARY] Train Loss: 0.9046 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8896 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 103/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.750 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50072 0.49927992] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.059 [MASKS] A(Pass/Fail): 696/1352 | B: 601/1447 | C: 390/1658 [LOSS Ex1] A: 0.65397 | B: 0.64364 | C: 0.63669 [LOGITS Ex2 A] Mean Abs: 1.978 | Max: 5.963 [LOSS Ex2] A: 0.14136 | B: 0.37331 | C: 0.26805 ** [JOINT LOSS] ** : 0.905669 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005188 | Grad Max: 0.219298 -> Layer: shared_layers.0.bias | Grad Mean: 0.229160 | Grad Max: 1.057505 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.005716 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001392 | Grad Max: 0.001392 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.395138 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029703 | Grad Max: 2.218769 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.007632 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013732 | Grad Max: 0.063056 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003161 | Grad Max: 0.006686 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000178 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000866 | Grad Max: 0.001895 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000930 | Grad Max: 0.001767 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018332 | Grad Max: 0.018332 [GRADIENT NORM TOTAL] 5.2681 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.511 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6662289 0.33377114] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.059 [MASKS] A(Pass/Fail): 680/1368 | B: 597/1451 | C: 405/1643 [LOSS Ex1] A: 0.65044 | B: 0.64442 | C: 0.63570 [LOGITS Ex2 A] Mean Abs: 1.988 | Max: 6.119 [LOSS Ex2] A: 0.15996 | B: 0.36915 | C: 0.27335 ** [JOINT LOSS] ** : 0.911005 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.138877 -> Layer: shared_layers.0.bias | Grad Mean: 0.134711 | Grad Max: 0.716225 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.006167 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005090 | Grad Max: 0.005090 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000950 | Grad Max: 0.153516 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015125 | Grad Max: 0.853783 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000069 | Grad Max: 0.003578 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002549 | Grad Max: 0.025396 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000156 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000334 | Grad Max: 0.002053 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000532 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000182 | Grad Max: 0.000769 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000786 | Grad Max: 0.000786 [GRADIENT NORM TOTAL] 2.9057 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.596 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6035885 0.39641148] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.060 [MASKS] A(Pass/Fail): 572/1044 | B: 601/1447 | C: 394/1654 [LOSS Ex1] A: 0.64885 | B: 0.64033 | C: 0.64003 [LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.038 [LOSS Ex2] A: 0.14186 | B: 0.35085 | C: 0.27799 ** [JOINT LOSS] ** : 0.899971 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.123176 
-> Layer: shared_layers.0.bias | Grad Mean: 0.277488 | Grad Max: 1.442962 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002196 | Grad Max: 0.006205 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002332 | Grad Max: 0.002332 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001847 | Grad Max: 0.257997 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034112 | Grad Max: 1.437707 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008907 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016353 | Grad Max: 0.076895 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003430 | Grad Max: 0.007625 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000182 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000914 | Grad Max: 0.002052 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000823 | Grad Max: 0.002196 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017896 | Grad Max: 0.017896 [GRADIENT NORM TOTAL] 5.9448 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.752 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071637 0.49283633] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 688/1360 | B: 567/1289 | C: 393/1655 [LOSS Ex1] A: 0.64933 | B: 0.64438 | C: 0.63894 [LOGITS Ex2 A] Mean Abs: 1.983 | Max: 6.919 [LOSS Ex2] A: 0.15272 | B: 0.34066 | C: 0.27086 ** [JOINT LOSS] ** : 0.898962 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003426 | Grad Max: 0.114550 -> Layer: shared_layers.0.bias | Grad Mean: 0.079115 | Grad Max: 0.366179 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000625 | Grad Max: 0.000625 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000743 | Grad Max: 0.156935 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011933 | Grad Max: 0.864668 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.004067 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002268 | Grad Max: 0.031933 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000176 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000307 | Grad Max: 0.001756 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000053 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000080 | Grad Max: 0.000504 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.000783 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000405 | Grad Max: 0.000405 [GRADIENT NORM TOTAL] 2.3473 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.704 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108048 0.48919517] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 686/1362 | B: 601/1447 | C: 359/1689 [LOSS Ex1] A: 0.64670 | B: 0.64344 | C: 0.64020 [LOGITS Ex2 A] Mean Abs: 1.982 | Max: 6.067 [LOSS Ex2] A: 0.14536 | B: 0.37028 | C: 0.28133 ** [JOINT LOSS] ** : 0.909107 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002923 | Grad Max: 0.068328 -> Layer: shared_layers.0.bias | Grad Mean: 0.152453 | Grad Max: 0.895642 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002331 | Grad Max: 0.006859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009981 | Grad Max: 0.009981 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001048 | Grad Max: 0.251468 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018348 | Grad Max: 1.420308 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000094 | Grad Max: 0.005849 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005035 | Grad Max: 0.044751 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000965 | Grad Max: 0.002972 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000258 | Grad Max: 0.000780 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000260 | Grad Max: 0.000965 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005330 | Grad Max: 0.005330 [GRADIENT NORM TOTAL] 3.8771 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.727 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029448 0.49705514] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 682/1366 | B: 597/1451 | C: 393/1655 [LOSS Ex1] A: 0.64569 | B: 0.64422 | C: 0.63854 [LOGITS Ex2 A] Mean Abs: 1.960 | Max: 6.686 [LOSS Ex2] A: 0.16570 | B: 0.36607 | C: 0.26022 ** [JOINT LOSS] ** : 0.906814 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003706 | Grad Max: 0.173327 -> Layer: shared_layers.0.bias | Grad Mean: 0.056491 | Grad Max: 0.200452 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002215 | Grad Max: 0.006378 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001037 | Grad Max: 0.001037 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000658 | Grad Max: 0.132926 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009710 | Grad Max: 0.668221 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.004304 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002088 | Grad Max: 0.024815 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000301 | Grad Max: 0.001671 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000052 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.000488 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000274 | Grad Max: 0.000718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000416 | Grad Max: 0.000416 [GRADIENT NORM TOTAL] 1.7678 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.105 | Max: 0.600 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50325227 0.49674776] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.059 [MASKS] A(Pass/Fail): 658/1390 | B: 601/1447 | C: 378/1670 [LOSS Ex1] A: 0.65316 | B: 0.64011 | C: 0.63757 [LOGITS Ex2 A] Mean Abs: 1.939 | Max: 7.053 [LOSS Ex2] A: 0.14079 | B: 0.33969 | C: 0.28114 ** [JOINT LOSS] ** : 0.897484 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.069403 -> Layer: shared_layers.0.bias | Grad Mean: 0.152091 | Grad Max: 0.864043 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006006 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004968 | Grad Max: 0.004968 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000971 | Grad Max: 0.210445 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017594 | Grad Max: 1.179626 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000118 | Grad Max: 0.005105 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006911 | Grad Max: 0.043056 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000226 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001459 | Grad Max: 0.003874 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000391 | Grad Max: 0.001110 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000416 | Grad Max: 0.001528 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007625 | Grad Max: 0.007625 [GRADIENT NORM TOTAL] 3.4867 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.103 | Max: 0.531 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54053634 0.45946366] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.546 | Std: 0.057 [MASKS] A(Pass/Fail): 655/1393 | B: 567/1289 | C: 421/1627 [LOSS Ex1] A: 0.65303 | B: 0.64416 | C: 0.63811 [LOGITS Ex2 A] Mean Abs: 1.939 | Max: 6.828 [LOSS Ex2] A: 0.15037 | B: 0.34318 | C: 0.27800 ** [JOINT LOSS] ** : 
0.902278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005490 | Grad Max: 0.235252 -> Layer: shared_layers.0.bias | Grad Mean: 0.092015 | Grad Max: 0.540924 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005360 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003172 | Grad Max: 0.003172 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000888 | Grad Max: 0.108699 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014374 | Grad Max: 0.551466 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004696 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004320 | Grad Max: 0.031101 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001052 | Grad Max: 0.003524 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000096 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000271 | Grad Max: 0.000854 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.001104 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005302 | Grad Max: 0.005302 [GRADIENT NORM TOTAL] 2.3113 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.678 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7090002 0.29099977] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.060 [MASKS] A(Pass/Fail): 706/1342 | B: 601/1447 | C: 389/1659 [LOSS Ex1] A: 0.64780 | B: 0.64321 | C: 0.63832 [LOGITS Ex2 A] Mean Abs: 1.972 | Max: 6.666 [LOSS Ex2] A: 0.14128 | B: 0.37028 | C: 0.29045 ** [JOINT LOSS] ** : 0.910443 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002924 | Grad Max: 0.089796 -> Layer: shared_layers.0.bias | Grad Mean: 0.055570 | Grad Max: 0.349306 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002321 | Grad Max: 0.006718 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010544 | Grad Max: 0.010544 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000686 | Grad 
Max: 0.193945 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011150 | Grad Max: 1.097914 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.004884 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002403 | Grad Max: 0.031624 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000145 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000345 | Grad Max: 0.002020 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000429 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000204 | Grad Max: 0.000661 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001009 | Grad Max: 0.001009 [GRADIENT NORM TOTAL] 2.1235 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.756 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008049 0.4991951] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.059 [MASKS] A(Pass/Fail): 696/1352 | B: 597/1451 | C: 411/1637 [LOSS Ex1] A: 0.65370 | B: 0.64397 | C: 0.63513 [LOGITS Ex2 A] Mean Abs: 1.992 | Max: 5.746 [LOSS Ex2] A: 0.13689 | B: 0.36674 | C: 0.28135 ** [JOINT LOSS] ** : 0.905926 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002462 | Grad Max: 0.069731 -> Layer: shared_layers.0.bias | Grad Mean: 0.128123 | Grad Max: 0.485318 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005973 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004295 | Grad Max: 0.004295 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000752 | Grad Max: 0.396218 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012876 | Grad Max: 2.218232 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.002739 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002092 | Grad Max: 0.019925 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000130 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000490 | Grad Max: 0.002518 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000156 | Grad Max: 0.000587 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000315 | Grad Max: 0.001142 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004663 | Grad Max: 0.004663 [GRADIENT NORM TOTAL] 3.7820 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.516 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.667576 0.33242396] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.060 [MASKS] A(Pass/Fail): 680/1368 | B: 601/1447 | C: 414/1634 [LOSS Ex1] A: 0.65013 | B: 0.63985 | C: 0.63603 [LOGITS Ex2 A] Mean Abs: 1.976 | Max: 5.900 [LOSS Ex2] A: 0.15768 | B: 0.35488 | C: 0.26650 ** [JOINT LOSS] ** : 0.901693 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.069968 -> Layer: shared_layers.0.bias | Grad Mean: 0.080019 | Grad Max: 0.330233 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.005960 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003209 | Grad Max: 0.003209 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000623 | Grad Max: 0.136639 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010687 | Grad Max: 0.751850 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.005665 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003851 | Grad Max: 0.031288 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000192 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000774 | Grad Max: 0.002874 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000082 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.000858 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000282 | Grad Max: 0.001092 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004069 | Grad Max: 0.004069 [GRADIENT NORM TOTAL] 2.0871 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.602 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6043871 0.39561287] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.060 [MASKS] A(Pass/Fail): 572/1044 | B: 567/1289 | C: 429/1619 [LOSS Ex1] A: 0.64853 | B: 0.64390 | C: 0.63693 [LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.914 [LOSS Ex2] A: 0.13935 | B: 0.35484 | C: 0.26107 ** [JOINT LOSS] ** : 0.894871 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002907 | Grad Max: 0.070416 -> Layer: shared_layers.0.bias | Grad Mean: 0.199217 | Grad Max: 0.857371 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006989 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010613 | Grad Max: 0.010613 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001344 | Grad Max: 0.207113 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024207 | Grad Max: 1.152167 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000207 | Grad Max: 0.008929 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012354 | Grad Max: 0.075131 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000356 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002506 | Grad Max: 0.006893 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000670 | Grad Max: 0.001659 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000597 | Grad Max: 0.001726 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013272 | Grad Max: 0.013272 [GRADIENT NORM TOTAL] 4.2563 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.758 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50708926 0.49291074] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 688/1360 | B: 601/1447 | C: 403/1645 [LOSS Ex1] A: 0.64901 | B: 0.64295 | C: 0.63949 [LOGITS Ex2 A] Mean Abs: 
1.994 | Max: 6.992 [LOSS Ex2] A: 0.14263 | B: 0.36589 | C: 0.25640 ** [JOINT LOSS] ** : 0.898787 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005063 | Grad Max: 0.202942 -> Layer: shared_layers.0.bias | Grad Mean: 0.091441 | Grad Max: 0.534830 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005812 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000213 | Grad Max: 0.000213 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000849 | Grad Max: 0.130432 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013742 | Grad Max: 0.692998 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.005572 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004234 | Grad Max: 0.025086 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001021 | Grad Max: 0.003928 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000288 | Grad Max: 0.000896 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000393 | Grad Max: 0.001421 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006169 | Grad Max: 0.006169 [GRADIENT NORM TOTAL] 2.2044 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.710 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108839 0.48911604] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 687/1361 | B: 598/1450 | C: 286/1090 [LOSS Ex1] A: 0.64634 | B: 0.64372 | C: 0.63967 [LOGITS Ex2 A] Mean Abs: 1.962 | Max: 6.217 [LOSS Ex2] A: 0.14570 | B: 0.36469 | C: 0.27514 ** [JOINT LOSS] ** : 0.905085 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.066891 -> Layer: shared_layers.0.bias | Grad Mean: 0.057118 | Grad Max: 0.313494 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002285 | Grad Max: 0.006415 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.007699 | Grad Max: 0.007699 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000655 | Grad Max: 0.164286 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010558 | Grad Max: 0.930329 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.004057 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001955 | Grad Max: 0.026105 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000169 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000297 | Grad Max: 0.001982 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000060 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000078 | Grad Max: 0.000514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000270 | Grad Max: 0.000914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000512 | Grad Max: 0.000512 [GRADIENT NORM TOTAL] 1.9672 [EPOCH SUMMARY] Train Loss: 0.9034 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8840 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8880 -> New: 0.8840) ############################## EPOCH 104/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.734 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.502902 0.497098] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 682/1366 | B: 602/1446 | C: 405/1643 [LOSS Ex1] A: 0.64533 | B: 0.63959 | C: 0.63750 [LOGITS Ex2 A] Mean Abs: 1.953 | Max: 7.330 [LOSS Ex2] A: 0.15402 | B: 0.35331 | C: 0.27128 ** [JOINT LOSS] ** : 0.900344 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003099 | Grad Max: 0.087777 -> Layer: shared_layers.0.bias | Grad Mean: 0.092034 | Grad Max: 0.359698 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.006389 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000040 | Grad Max: 0.000040 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000675 | Grad Max: 
0.095445 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012206 | Grad Max: 0.535767 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.004242 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004247 | Grad Max: 0.031215 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000159 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000880 | Grad Max: 0.003413 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.000825 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000309 | Grad Max: 0.001109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004860 | Grad Max: 0.004860 [GRADIENT NORM TOTAL] 2.0599 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.606 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031084 0.49689165] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.547 | Std: 0.059 [MASKS] A(Pass/Fail): 658/1390 | B: 567/1289 | C: 390/1658 [LOSS Ex1] A: 0.65284 | B: 0.64363 | C: 0.64317 [LOGITS Ex2 A] Mean Abs: 1.947 | Max: 5.971 [LOSS Ex2] A: 0.14324 | B: 0.34613 | C: 0.27993 ** [JOINT LOSS] ** : 0.902981 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003929 | Grad Max: 0.161977 -> Layer: shared_layers.0.bias | Grad Mean: 0.146415 | Grad Max: 0.910827 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.005469 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004495 | Grad Max: 0.004495 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001140 | Grad Max: 0.184266 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019408 | Grad Max: 1.031261 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.006530 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005082 | Grad Max: 0.060516 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000879 | Grad Max: 0.003408 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000229 | Grad Max: 0.000872 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.000975 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004648 | Grad Max: 0.004648 [GRADIENT NORM TOTAL] 3.5023 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.104 | Max: 0.537 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5408156 0.4591844] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.057 [MASKS] A(Pass/Fail): 657/1391 | B: 601/1447 | C: 406/1642 [LOSS Ex1] A: 0.65271 | B: 0.64267 | C: 0.63959 [LOGITS Ex2 A] Mean Abs: 1.951 | Max: 6.587 [LOSS Ex2] A: 0.15244 | B: 0.37201 | C: 0.28489 ** [JOINT LOSS] ** : 0.914773 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004154 | Grad Max: 0.167241 -> Layer: shared_layers.0.bias | Grad Mean: 0.173882 | Grad Max: 0.794187 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005944 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007523 | Grad Max: 0.007523 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001322 | Grad Max: 0.197952 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022387 | Grad Max: 1.102213 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000145 | Grad Max: 0.007074 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007755 | Grad Max: 0.054545 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001359 | Grad Max: 0.004852 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000337 | Grad Max: 0.001061 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000278 | Grad Max: 0.001029 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006293 | Grad Max: 0.006293 [GRADIENT NORM TOTAL] 3.9320 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.685 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71106994 0.2889301 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.061 [MASKS] A(Pass/Fail): 706/1342 | B: 601/1447 | C: 425/1623 [LOSS Ex1] A: 0.64743 | B: 0.64344 | C: 0.63393 [LOGITS Ex2 A] Mean Abs: 1.971 | Max: 6.002 [LOSS Ex2] A: 0.14182 | B: 0.37578 | C: 0.26150 ** [JOINT LOSS] ** : 0.901300 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003642 | Grad Max: 0.104679 -> Layer: shared_layers.0.bias | Grad Mean: 0.191379 | Grad Max: 0.888431 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006157 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003866 | Grad Max: 0.003866 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001263 | Grad Max: 0.395372 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022607 | Grad Max: 2.204231 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006067 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009875 | Grad Max: 0.052292 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000305 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002214 | Grad Max: 0.005122 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000599 | Grad Max: 0.001378 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000661 | Grad Max: 0.001566 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012605 | Grad Max: 0.012605 [GRADIENT NORM TOTAL] 4.5130 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.763 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008894 0.49911052] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 696/1352 | B: 602/1446 | C: 404/1644 [LOSS Ex1] A: 0.65337 | B: 0.63931 | C: 0.63764 [LOGITS Ex2 A] Mean Abs: 
1.970 | Max: 6.229 [LOSS Ex2] A: 0.13149 | B: 0.35117 | C: 0.25214 ** [JOINT LOSS] ** : 0.888379 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003039 | Grad Max: 0.087151 -> Layer: shared_layers.0.bias | Grad Mean: 0.120192 | Grad Max: 0.511599 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.005838 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007384 | Grad Max: 0.007384 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000747 | Grad Max: 0.376162 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012557 | Grad Max: 2.111840 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003745 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003044 | Grad Max: 0.025813 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000718 | Grad Max: 0.003045 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000186 | Grad Max: 0.000838 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000489 | Grad Max: 0.001330 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003225 | Grad Max: 0.003225 [GRADIENT NORM TOTAL] 3.5166 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.521 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.66910523 0.33089474] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 680/1368 | B: 567/1289 | C: 422/1626 [LOSS Ex1] A: 0.64978 | B: 0.64337 | C: 0.63635 [LOGITS Ex2 A] Mean Abs: 1.982 | Max: 5.633 [LOSS Ex2] A: 0.15079 | B: 0.34647 | C: 0.25988 ** [JOINT LOSS] ** : 0.895542 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.053555 -> Layer: shared_layers.0.bias | Grad Mean: 0.088992 | Grad Max: 0.366005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005891 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.000374 | Grad Max: 0.000374 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000794 | Grad Max: 0.133179 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013852 | Grad Max: 0.739968 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.004627 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006271 | Grad Max: 0.032488 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000257 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001386 | Grad Max: 0.004014 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000106 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000387 | Grad Max: 0.001123 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000466 | Grad Max: 0.001632 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009312 | Grad Max: 0.009312 [GRADIENT NORM TOTAL] 2.1900 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.608 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.605253 0.39474696] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 574/1042 | B: 601/1447 | C: 408/1640 [LOSS Ex1] A: 0.64817 | B: 0.64241 | C: 0.63716 [LOGITS Ex2 A] Mean Abs: 2.013 | Max: 5.872 [LOSS Ex2] A: 0.13761 | B: 0.37463 | C: 0.26603 ** [JOINT LOSS] ** : 0.902007 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002982 | Grad Max: 0.071280 -> Layer: shared_layers.0.bias | Grad Mean: 0.201082 | Grad Max: 1.006298 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006224 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003989 | Grad Max: 0.003989 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001343 | Grad Max: 0.209530 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024124 | Grad Max: 1.171321 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.007359 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009625 | Grad Max: 0.058842 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad 
Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001941 | Grad Max: 0.005090 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000524 | Grad Max: 0.001431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000518 | Grad Max: 0.001409 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011007 | Grad Max: 0.011007 [GRADIENT NORM TOTAL] 4.4694 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.765 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50704044 0.49295953] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 689/1359 | B: 601/1447 | C: 427/1621 [LOSS Ex1] A: 0.64865 | B: 0.64318 | C: 0.63420 [LOGITS Ex2 A] Mean Abs: 1.995 | Max: 7.835 [LOSS Ex2] A: 0.13577 | B: 0.36923 | C: 0.28137 ** [JOINT LOSS] ** : 0.904134 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002968 | Grad Max: 0.134296 -> Layer: shared_layers.0.bias | Grad Mean: 0.075863 | Grad Max: 0.246819 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.005887 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002618 | Grad Max: 0.002618 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000645 | Grad Max: 0.214752 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010206 | Grad Max: 1.150349 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002713 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001882 | Grad Max: 0.016636 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000153 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000330 | Grad Max: 0.002075 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000091 | Grad Max: 0.000444 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000232 | Grad Max: 0.000825 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001329 | Grad Max: 0.001329 
[GRADIENT NORM TOTAL] 2.2321 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.718 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109122 0.4890878] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 688/1360 | B: 602/1446 | C: 393/1655 [LOSS Ex1] A: 0.64596 | B: 0.63905 | C: 0.64058 [LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.911 [LOSS Ex2] A: 0.13924 | B: 0.34992 | C: 0.27839 ** [JOINT LOSS] ** : 0.897711 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003370 | Grad Max: 0.068272 -> Layer: shared_layers.0.bias | Grad Mean: 0.168987 | Grad Max: 0.850505 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002297 | Grad Max: 0.006476 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006598 | Grad Max: 0.006598 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001215 | Grad Max: 0.215851 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021921 | Grad Max: 1.206809 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.006874 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009358 | Grad Max: 0.055348 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000293 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002034 | Grad Max: 0.005283 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000123 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000543 | Grad Max: 0.001404 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000559 | Grad Max: 0.001507 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011172 | Grad Max: 0.011172 [GRADIENT NORM TOTAL] 3.7387 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.741 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50292873 0.49707127] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.060 [MASKS] A(Pass/Fail): 684/1364 | B: 567/1289 | C: 422/1626 
[LOSS Ex1] A: 0.64495 | B: 0.64311 | C: 0.63211 [LOGITS Ex2 A] Mean Abs: 1.963 | Max: 7.008 [LOSS Ex2] A: 0.15270 | B: 0.34867 | C: 0.24313 ** [JOINT LOSS] ** : 0.888222 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003121 | Grad Max: 0.077247 -> Layer: shared_layers.0.bias | Grad Mean: 0.155333 | Grad Max: 0.595657 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002313 | Grad Max: 0.006604 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001280 | Grad Max: 0.001280 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001087 | Grad Max: 0.099529 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019096 | Grad Max: 0.548450 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.008283 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009777 | Grad Max: 0.070187 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000300 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001948 | Grad Max: 0.005215 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000500 | Grad Max: 0.001349 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000438 | Grad Max: 0.001458 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008657 | Grad Max: 0.008657 [GRADIENT NORM TOTAL] 3.0962 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.613 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50301236 0.49698767] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.060 [MASKS] A(Pass/Fail): 660/1388 | B: 601/1447 | C: 434/1614 [LOSS Ex1] A: 0.65251 | B: 0.64215 | C: 0.63375 [LOGITS Ex2 A] Mean Abs: 1.960 | Max: 5.906 [LOSS Ex2] A: 0.14764 | B: 0.37423 | C: 0.28112 ** [JOINT LOSS] ** : 0.910465 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001942 | Grad Max: 0.041045 -> Layer: shared_layers.0.bias | Grad Mean: 0.082610 | Grad Max: 0.392511 -> Layer: exit1_layers.0.weight | Grad Mean: 
0.002188 | Grad Max: 0.006876 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010629 | Grad Max: 0.010629 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000720 | Grad Max: 0.152706 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012473 | Grad Max: 0.842636 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.004775 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002874 | Grad Max: 0.027660 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000170 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000482 | Grad Max: 0.002282 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000109 | Grad Max: 0.000631 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000191 | Grad Max: 0.000670 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000547 | Grad Max: 0.000547 [GRADIENT NORM TOTAL] 2.4893 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.105 | Max: 0.543 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409642 0.45903584] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.058 [MASKS] A(Pass/Fail): 661/1387 | B: 601/1447 | C: 414/1634 [LOSS Ex1] A: 0.65239 | B: 0.64292 | C: 0.63764 [LOGITS Ex2 A] Mean Abs: 1.954 | Max: 5.865 [LOSS Ex2] A: 0.15213 | B: 0.36697 | C: 0.25630 ** [JOINT LOSS] ** : 0.902780 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003750 | Grad Max: 0.147732 -> Layer: shared_layers.0.bias | Grad Mean: 0.117029 | Grad Max: 0.477745 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.006725 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011117 | Grad Max: 0.011117 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000964 | Grad Max: 0.154731 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015965 | Grad Max: 0.854128 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.003823 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002392 | Grad 
Max: 0.031446 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000317 | Grad Max: 0.001819 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000523 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000282 | Grad Max: 0.000712 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000889 | Grad Max: 0.000889 [GRADIENT NORM TOTAL] 2.8621 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.691 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7130228 0.28697714] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.061 [MASKS] A(Pass/Fail): 708/1340 | B: 602/1446 | C: 416/1632 [LOSS Ex1] A: 0.64708 | B: 0.63878 | C: 0.63703 [LOGITS Ex2 A] Mean Abs: 1.993 | Max: 6.524 [LOSS Ex2] A: 0.13816 | B: 0.34663 | C: 0.26482 ** [JOINT LOSS] ** : 0.890834 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.042926 -> Layer: shared_layers.0.bias | Grad Mean: 0.108781 | Grad Max: 0.437262 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002361 | Grad Max: 0.006639 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009480 | Grad Max: 0.009480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000832 | Grad Max: 0.327847 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014933 | Grad Max: 1.831866 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.004138 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004692 | Grad Max: 0.034681 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000186 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000951 | Grad Max: 0.003162 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000253 | Grad Max: 0.000791 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 
0.001087 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004415 | Grad Max: 0.004415 [GRADIENT NORM TOTAL] 3.4236 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.770 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009303 0.49906972] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 697/1351 | B: 567/1289 | C: 272/1104 [LOSS Ex1] A: 0.65304 | B: 0.64285 | C: 0.63800 [LOGITS Ex2 A] Mean Abs: 1.991 | Max: 5.957 [LOSS Ex2] A: 0.13570 | B: 0.36123 | C: 0.29365 ** [JOINT LOSS] ** : 0.908159 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005506 | Grad Max: 0.169706 -> Layer: shared_layers.0.bias | Grad Mean: 0.282008 | Grad Max: 1.068803 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005618 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003732 | Grad Max: 0.003732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 0.401939 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034872 | Grad Max: 2.245164 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.010675 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016433 | Grad Max: 0.083245 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000392 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003628 | Grad Max: 0.007835 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000218 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000966 | Grad Max: 0.002578 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000954 | Grad Max: 0.001936 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019408 | Grad Max: 0.019408 [GRADIENT NORM TOTAL] 6.1825 [EPOCH SUMMARY] Train Loss: 0.9005 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8813 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8840 -> New: 0.8813) ############################## EPOCH 105/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.527 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6705793 0.32942063] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 681/1367 | B: 603/1445 | C: 406/1642 [LOSS Ex1] A: 0.64943 | B: 0.64191 | C: 0.63500 [LOGITS Ex2 A] Mean Abs: 1.991 | Max: 6.205 [LOSS Ex2] A: 0.14938 | B: 0.36587 | C: 0.27548 ** [JOINT LOSS] ** : 0.905688 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002999 | Grad Max: 0.086351 -> Layer: shared_layers.0.bias | Grad Mean: 0.159070 | Grad Max: 0.675847 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006203 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000596 | Grad Max: 0.000596 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001058 | Grad Max: 0.144057 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018836 | Grad Max: 0.775138 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.006549 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008185 | Grad Max: 0.048351 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000282 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001822 | Grad Max: 0.004792 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000492 | Grad Max: 0.001361 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000486 | Grad Max: 0.001165 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009928 | Grad Max: 0.009928 [GRADIENT NORM TOTAL] 3.1712 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.614 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6060094 0.39399058] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.551 | Std: 0.061 [MASKS] A(Pass/Fail): 576/1040 | B: 602/1446 | C: 433/1615 [LOSS Ex1] A: 0.64782 | B: 0.64268 | C: 0.63424 [LOGITS Ex2 A] Mean Abs: 2.067 | Max: 6.491 [LOSS Ex2] A: 0.14135 | B: 0.36790 | C: 0.27029 ** [JOINT LOSS] ** : 0.901429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007452 | Grad Max: 0.202066 -> Layer: shared_layers.0.bias | Grad Mean: 0.477778 | Grad Max: 2.311432 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.006524 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005937 | Grad Max: 0.005937 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003188 | Grad Max: 0.368864 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058916 | Grad Max: 2.062617 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000472 | Grad Max: 0.015984 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028523 | Grad Max: 0.140471 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000646 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006132 | Grad Max: 0.012467 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000304 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001605 | Grad Max: 0.003878 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001458 | Grad Max: 0.002930 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030594 | Grad Max: 0.030594 [GRADIENT NORM TOTAL] 9.8399 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.773 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070545 0.4929455] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 690/1358 | B: 603/1445 | C: 422/1626 [LOSS Ex1] A: 0.64830 | B: 0.63854 | C: 0.63492 [LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.941 [LOSS Ex2] A: 0.14671 | B: 0.35574 | C: 0.28943 ** [JOINT LOSS] ** : 0.904544 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008898 | Grad Max: 0.271319 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.510641 | Grad Max: 2.365273 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.005658 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001142 | Grad Max: 0.001142 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003472 | Grad Max: 0.404797 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063632 | Grad Max: 2.261539 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000507 | Grad Max: 0.017577 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030354 | Grad Max: 0.170910 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000725 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006550 | Grad Max: 0.013795 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000332 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001717 | Grad Max: 0.003925 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001629 | Grad Max: 0.003266 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033472 | Grad Max: 0.033472 [GRADIENT NORM TOTAL] 10.5019 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.725 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109345 0.48906556] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 690/1358 | B: 567/1289 | C: 406/1642 [LOSS Ex1] A: 0.64559 | B: 0.64263 | C: 0.63689 [LOGITS Ex2 A] Mean Abs: 2.030 | Max: 6.960 [LOSS Ex2] A: 0.13781 | B: 0.33962 | C: 0.28694 ** [JOINT LOSS] ** : 0.896497 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005052 | Grad Max: 0.136568 -> Layer: shared_layers.0.bias | Grad Mean: 0.132034 | Grad Max: 0.643784 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.006530 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007833 | Grad Max: 0.007833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001028 | Grad Max: 0.167035 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018137 | Grad Max: 0.932139 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000142 | Grad Max: 0.004209 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008024 | Grad Max: 0.035266 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000276 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001820 | Grad Max: 0.004585 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000495 | Grad Max: 0.001390 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000560 | Grad Max: 0.001398 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010773 | Grad Max: 0.010773 [GRADIENT NORM TOTAL] 3.0135 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.748 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5029956 0.4970044] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 684/1364 | B: 604/1444 | C: 411/1637 [LOSS Ex1] A: 0.64459 | B: 0.64170 | C: 0.63942 [LOGITS Ex2 A] Mean Abs: 1.930 | Max: 6.900 [LOSS Ex2] A: 0.15843 | B: 0.40906 | C: 0.28181 ** [JOINT LOSS] ** : 0.924999 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006659 | Grad Max: 0.228085 -> Layer: shared_layers.0.bias | Grad Mean: 0.723277 | Grad Max: 3.110711 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.007058 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008381 | Grad Max: 0.008381 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004458 | Grad Max: 0.653719 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083726 | Grad Max: 3.683842 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000694 | Grad Max: 0.025008 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042519 | Grad Max: 0.244467 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000987 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008792 | Grad Max: 0.018090 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000420 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002320 | Grad 
Max: 0.005475 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002243 | Grad Max: 0.004364 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046979 | Grad Max: 0.046979 [GRADIENT NORM TOTAL] 14.5055 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.619 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50299233 0.49700767] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.060 [MASKS] A(Pass/Fail): 660/1388 | B: 602/1446 | C: 405/1643 [LOSS Ex1] A: 0.65219 | B: 0.64248 | C: 0.63804 [LOGITS Ex2 A] Mean Abs: 1.903 | Max: 6.366 [LOSS Ex2] A: 0.14474 | B: 0.41638 | C: 0.27885 ** [JOINT LOSS] ** : 0.924224 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011520 | Grad Max: 0.264813 -> Layer: shared_layers.0.bias | Grad Mean: 0.902009 | Grad Max: 3.676590 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005800 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003673 | Grad Max: 0.003673 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005761 | Grad Max: 0.737653 -> Layer: exit2_layers.0.bias | Grad Mean: 0.108282 | Grad Max: 4.156323 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000905 | Grad Max: 0.030505 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055080 | Grad Max: 0.282614 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001216 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011627 | Grad Max: 0.023436 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000563 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003065 | Grad Max: 0.007174 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002976 | Grad Max: 0.005624 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061562 | Grad Max: 0.061562 [GRADIENT NORM TOTAL] 17.9378 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.548 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5411448 0.45885518] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.058 [MASKS] A(Pass/Fail): 661/1387 | B: 603/1445 | C: 391/1657 [LOSS Ex1] A: 0.65208 | B: 0.63834 | C: 0.63899 [LOGITS Ex2 A] Mean Abs: 1.893 | Max: 5.985 [LOSS Ex2] A: 0.15113 | B: 0.37881 | C: 0.27447 ** [JOINT LOSS] ** : 0.911276 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008997 | Grad Max: 0.226028 -> Layer: shared_layers.0.bias | Grad Mean: 0.577809 | Grad Max: 2.362106 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.005544 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004387 | Grad Max: 0.004387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003742 | Grad Max: 0.452272 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070308 | Grad Max: 2.415835 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000595 | Grad Max: 0.019633 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036093 | Grad Max: 0.194101 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000831 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007638 | Grad Max: 0.016456 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000376 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002008 | Grad Max: 0.004732 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001993 | Grad Max: 0.003560 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040007 | Grad Max: 0.040007 [GRADIENT NORM TOTAL] 11.3614 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.698 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.714679 0.28532094] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.061 [MASKS] A(Pass/Fail): 708/1340 | B: 567/1289 | C: 401/1647 [LOSS Ex1] A: 0.64677 | B: 0.64245 | C: 0.63862 [LOGITS Ex2 A] Mean Abs: 2.002 | Max: 6.132 [LOSS Ex2] A: 0.13640 | B: 0.34827 | C: 0.28876 ** [JOINT LOSS] ** : 0.900422 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.002271 | Grad Max: 0.056541 -> Layer: shared_layers.0.bias | Grad Mean: 0.081321 | Grad Max: 0.407096 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.006243 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004607 | Grad Max: 0.004607 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000714 | Grad Max: 0.141690 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012403 | Grad Max: 0.798022 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.004265 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003887 | Grad Max: 0.034846 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000156 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000786 | Grad Max: 0.003213 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000209 | Grad Max: 0.000791 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000223 | Grad Max: 0.000939 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003877 | Grad Max: 0.003877 [GRADIENT NORM TOTAL] 2.3834 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.777 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50092936 0.4990706 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 697/1351 | B: 605/1443 | C: 406/1642 [LOSS Ex1] A: 0.65277 | B: 0.64152 | C: 0.63584 [LOGITS Ex2 A] Mean Abs: 2.023 | Max: 5.830 [LOSS Ex2] A: 0.13078 | B: 0.37968 | C: 0.27281 ** [JOINT LOSS] ** : 0.904465 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004181 | Grad Max: 0.157812 -> Layer: shared_layers.0.bias | Grad Mean: 0.424656 | Grad Max: 1.921044 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006037 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005813 | Grad Max: 0.005813 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002739 | Grad Max: 0.341466 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.050985 | Grad Max: 1.884191 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.015987 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026777 | Grad Max: 0.153787 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000564 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005609 | Grad Max: 0.011757 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000295 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001475 | Grad Max: 0.003783 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001329 | Grad Max: 0.002776 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029201 | Grad Max: 0.029201 [GRADIENT NORM TOTAL] 8.6682 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.531 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67173374 0.32826623] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 681/1367 | B: 602/1446 | C: 413/1635 [LOSS Ex1] A: 0.64915 | B: 0.64230 | C: 0.63693 [LOGITS Ex2 A] Mean Abs: 2.001 | Max: 5.397 [LOSS Ex2] A: 0.16017 | B: 0.36239 | C: 0.25805 ** [JOINT LOSS] ** : 0.902997 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004289 | Grad Max: 0.116330 -> Layer: shared_layers.0.bias | Grad Mean: 0.300202 | Grad Max: 1.264248 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005599 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001828 | Grad Max: 0.001828 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.341771 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035231 | Grad Max: 1.898149 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000274 | Grad Max: 0.010792 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016852 | Grad Max: 0.101600 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000354 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003421 | Grad Max: 0.007528 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000903 | Grad Max: 0.002209 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000752 | Grad Max: 0.001896 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017901 | Grad Max: 0.017901 [GRADIENT NORM TOTAL] 6.3194 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.619 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60671204 0.39328793] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 577/1039 | B: 603/1445 | C: 416/1632 [LOSS Ex1] A: 0.64754 | B: 0.63816 | C: 0.63519 [LOGITS Ex2 A] Mean Abs: 2.012 | Max: 5.858 [LOSS Ex2] A: 0.14788 | B: 0.34520 | C: 0.26841 ** [JOINT LOSS] ** : 0.894130 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005707 | Grad Max: 0.173097 -> Layer: shared_layers.0.bias | Grad Mean: 0.277185 | Grad Max: 1.108382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.006760 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007223 | Grad Max: 0.007223 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001829 | Grad Max: 0.515995 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033450 | Grad Max: 2.889395 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.010928 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015121 | Grad Max: 0.091395 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000412 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003270 | Grad Max: 0.007566 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000197 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000866 | Grad Max: 0.002185 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000906 | Grad Max: 0.002160 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017843 | Grad Max: 0.017843 [GRADIENT NORM TOTAL] 6.2330 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.778 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070422 0.49295774] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 692/1356 | B: 569/1287 | C: 394/1654 [LOSS Ex1] A: 0.64804 | B: 0.64227 | C: 0.63887 [LOGITS Ex2 A] Mean Abs: 1.993 | Max: 6.620 [LOSS Ex2] A: 0.14167 | B: 0.35229 | C: 0.27703 ** [JOINT LOSS] ** : 0.900058 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005610 | Grad Max: 0.195088 -> Layer: shared_layers.0.bias | Grad Mean: 0.308712 | Grad Max: 1.369542 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002271 | Grad Max: 0.006415 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011016 | Grad Max: 0.011016 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.420056 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037771 | Grad Max: 2.350629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.010529 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017136 | Grad Max: 0.083133 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000414 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003718 | Grad Max: 0.007961 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000190 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000992 | Grad Max: 0.002367 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000946 | Grad Max: 0.002091 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019934 | Grad Max: 0.019934 [GRADIENT NORM TOTAL] 6.6271 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.730 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109994 0.4890006] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 690/1358 | B: 605/1443 | C: 441/1607 [LOSS Ex1] A: 0.64533 | B: 0.64135 | C: 0.63220 [LOGITS Ex2 A] Mean Abs: 1.987 | Max: 5.925 
[LOSS Ex2] A: 0.14608 | B: 0.36132 | C: 0.25167 ** [JOINT LOSS] ** : 0.892648 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001728 | Grad Max: 0.038921 -> Layer: shared_layers.0.bias | Grad Mean: 0.101603 | Grad Max: 0.583791 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002285 | Grad Max: 0.006359 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004938 | Grad Max: 0.004938 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000676 | Grad Max: 0.216822 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011912 | Grad Max: 1.221963 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.004297 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004063 | Grad Max: 0.027800 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000214 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000768 | Grad Max: 0.003573 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000197 | Grad Max: 0.000773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000386 | Grad Max: 0.001214 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003707 | Grad Max: 0.003707 [GRADIENT NORM TOTAL] 2.5906 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.753 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.502986 0.497014] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 684/1364 | B: 603/1445 | C: 327/1049 [LOSS Ex1] A: 0.64435 | B: 0.64213 | C: 0.62956 [LOGITS Ex2 A] Mean Abs: 2.022 | Max: 6.570 [LOSS Ex2] A: 0.16313 | B: 0.37554 | C: 0.23716 ** [JOINT LOSS] ** : 0.897287 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007993 | Grad Max: 0.261216 -> Layer: shared_layers.0.bias | Grad Mean: 0.505051 | Grad Max: 2.303274 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.007127 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001136 | Grad Max: 
0.001136 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003427 | Grad Max: 0.438750 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062319 | Grad Max: 2.448532 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000521 | Grad Max: 0.015888 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031593 | Grad Max: 0.154444 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000673 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006905 | Grad Max: 0.014321 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000310 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001855 | Grad Max: 0.003990 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001730 | Grad Max: 0.003597 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037145 | Grad Max: 0.037145 [GRADIENT NORM TOTAL] 10.4053 [EPOCH SUMMARY] Train Loss: 0.9043 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8901 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 106/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.623 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50290793 0.49709207] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.060 [MASKS] A(Pass/Fail): 660/1388 | B: 603/1445 | C: 430/1618 [LOSS Ex1] A: 0.65198 | B: 0.63799 | C: 0.63322 [LOGITS Ex2 A] Mean Abs: 1.996 | Max: 6.214 [LOSS Ex2] A: 0.14142 | B: 0.35373 | C: 0.28787 ** [JOINT LOSS] ** : 0.902068 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007735 | Grad Max: 0.205423 -> Layer: shared_layers.0.bias | Grad Mean: 0.549919 | Grad Max: 2.513360 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.006726 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011023 | Grad Max: 0.011023 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003505 | Grad Max: 0.465356 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.064933 | Grad Max: 2.607980 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.019639 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032447 | Grad Max: 0.183136 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000718 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006930 | Grad Max: 0.014430 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001828 | Grad Max: 0.004503 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001607 | Grad Max: 0.003142 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034971 | Grad Max: 0.034971 [GRADIENT NORM TOTAL] 11.2645 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.552 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5412331 0.4587669] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.547 | Std: 0.059 [MASKS] A(Pass/Fail): 662/1386 | B: 569/1287 | C: 450/1598 [LOSS Ex1] A: 0.65188 | B: 0.64211 | C: 0.63691 [LOGITS Ex2 A] Mean Abs: 1.931 | Max: 5.882 [LOSS Ex2] A: 0.14641 | B: 0.34000 | C: 0.24851 ** [JOINT LOSS] ** : 0.888607 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.113714 -> Layer: shared_layers.0.bias | Grad Mean: 0.098523 | Grad Max: 0.485299 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.005718 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003794 | Grad Max: 0.003794 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000851 | Grad Max: 0.139101 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014576 | Grad Max: 0.766629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004791 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005078 | Grad Max: 0.036864 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000178 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001193 | Grad Max: 0.003700 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 
| Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000335 | Grad Max: 0.001022 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000524 | Grad Max: 0.001740 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007653 | Grad Max: 0.007653 [GRADIENT NORM TOTAL] 2.4158 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.702 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7159532 0.2840468] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.062 [MASKS] A(Pass/Fail): 708/1340 | B: 605/1443 | C: 408/1640 [LOSS Ex1] A: 0.64653 | B: 0.64118 | C: 0.63904 [LOGITS Ex2 A] Mean Abs: 1.958 | Max: 6.332 [LOSS Ex2] A: 0.13711 | B: 0.39916 | C: 0.28609 ** [JOINT LOSS] ** : 0.916372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006666 | Grad Max: 0.202244 -> Layer: shared_layers.0.bias | Grad Mean: 0.596156 | Grad Max: 2.730278 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.006048 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002975 | Grad Max: 0.002975 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003692 | Grad Max: 0.462359 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069274 | Grad Max: 2.560252 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000571 | Grad Max: 0.018688 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035253 | Grad Max: 0.185974 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000732 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007447 | Grad Max: 0.014967 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001974 | Grad Max: 0.004756 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001860 | Grad Max: 0.003592 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039313 | Grad Max: 0.039313 [GRADIENT NORM TOTAL] 11.8024 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.781 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50098455 0.49901545] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 699/1349 | B: 604/1444 | C: 434/1614 [LOSS Ex1] A: 0.65256 | B: 0.64196 | C: 0.63893 [LOGITS Ex2 A] Mean Abs: 1.945 | Max: 5.796 [LOSS Ex2] A: 0.13792 | B: 0.39658 | C: 0.29336 ** [JOINT LOSS] ** : 0.920441 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010550 | Grad Max: 0.262212 -> Layer: shared_layers.0.bias | Grad Mean: 0.758947 | Grad Max: 3.079138 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.005470 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001893 | Grad Max: 0.001893 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004866 | Grad Max: 0.566247 -> Layer: exit2_layers.0.bias | Grad Mean: 0.090617 | Grad Max: 3.164000 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000771 | Grad Max: 0.025905 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047369 | Grad Max: 0.267800 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000096 | Grad Max: 0.000993 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010147 | Grad Max: 0.020384 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000443 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002696 | Grad Max: 0.006007 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002619 | Grad Max: 0.005069 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054420 | Grad Max: 0.054420 [GRADIENT NORM TOTAL] 14.7533 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.535 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67262954 0.32737046] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 683/1365 | B: 604/1444 | C: 429/1619 [LOSS Ex1] A: 0.64893 | B: 0.63783 | C: 0.63682 [LOGITS Ex2 A] Mean Abs: 1.952 | Max: 6.116 [LOSS Ex2] A: 0.15427 | B: 0.35974 | C: 
0.26836 ** [JOINT LOSS] ** : 0.901983 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007448 | Grad Max: 0.225885 -> Layer: shared_layers.0.bias | Grad Mean: 0.396249 | Grad Max: 1.493262 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006249 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002768 | Grad Max: 0.002768 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002552 | Grad Max: 0.287423 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046782 | Grad Max: 1.444238 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000399 | Grad Max: 0.012117 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024243 | Grad Max: 0.115778 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000620 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005232 | Grad Max: 0.011567 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001375 | Grad Max: 0.003365 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001336 | Grad Max: 0.002602 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027369 | Grad Max: 0.027369 [GRADIENT NORM TOTAL] 7.5145 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.623 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6071847 0.39281532] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 577/1039 | B: 569/1287 | C: 417/1631 [LOSS Ex1] A: 0.64732 | B: 0.64194 | C: 0.63910 [LOGITS Ex2 A] Mean Abs: 2.055 | Max: 7.247 [LOSS Ex2] A: 0.13565 | B: 0.34695 | C: 0.27356 ** [JOINT LOSS] ** : 0.894841 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003373 | Grad Max: 0.128613 -> Layer: shared_layers.0.bias | Grad Mean: 0.264962 | Grad Max: 1.506265 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005882 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003555 | Grad Max: 0.003555 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.001779 | Grad Max: 0.253834 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031933 | Grad Max: 1.378420 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.009609 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014647 | Grad Max: 0.078670 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000415 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002935 | Grad Max: 0.007541 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000167 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000764 | Grad Max: 0.001909 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000655 | Grad Max: 0.001714 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015016 | Grad Max: 0.015016 [GRADIENT NORM TOTAL] 5.8616 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.783 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5070288 0.49297115] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 693/1355 | B: 606/1442 | C: 447/1601 [LOSS Ex1] A: 0.64783 | B: 0.64103 | C: 0.63475 [LOGITS Ex2 A] Mean Abs: 2.033 | Max: 6.367 [LOSS Ex2] A: 0.14316 | B: 0.38510 | C: 0.27357 ** [JOINT LOSS] ** : 0.908478 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006251 | Grad Max: 0.208559 -> Layer: shared_layers.0.bias | Grad Mean: 0.583420 | Grad Max: 2.768114 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005814 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001406 | Grad Max: 0.001406 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003683 | Grad Max: 0.426443 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069402 | Grad Max: 2.347536 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.021534 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035887 | Grad Max: 0.195433 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000793 -> Layer: exit2_layers.6.bias | Grad Mean: 
0.007504 | Grad Max: 0.015661 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001961 | Grad Max: 0.004358 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001767 | Grad Max: 0.003372 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037481 | Grad Max: 0.037481 [GRADIENT NORM TOTAL] 11.9518 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.735 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51094633 0.4890536 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 690/1358 | B: 604/1444 | C: 413/1635 [LOSS Ex1] A: 0.64510 | B: 0.64182 | C: 0.63776 [LOGITS Ex2 A] Mean Abs: 2.033 | Max: 5.786 [LOSS Ex2] A: 0.15097 | B: 0.36758 | C: 0.26195 ** [JOINT LOSS] ** : 0.901728 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005237 | Grad Max: 0.187891 -> Layer: shared_layers.0.bias | Grad Mean: 0.458817 | Grad Max: 2.354578 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.006271 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004028 | Grad Max: 0.004028 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002909 | Grad Max: 0.355134 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054464 | Grad Max: 1.998014 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.015701 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027518 | Grad Max: 0.155586 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000652 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005743 | Grad Max: 0.012412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001520 | Grad Max: 0.003620 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001393 | Grad Max: 0.002982 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029916 | Grad Max: 0.029916 [GRADIENT NORM TOTAL] 9.6202 >>> [TRAIN] BATCH 8 START 
<<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.758 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50302607 0.4969739 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 684/1364 | B: 605/1443 | C: 447/1601 [LOSS Ex1] A: 0.64412 | B: 0.63769 | C: 0.63249 [LOGITS Ex2 A] Mean Abs: 1.988 | Max: 6.564 [LOSS Ex2] A: 0.15425 | B: 0.35069 | C: 0.24053 ** [JOINT LOSS] ** : 0.886591 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.046923 -> Layer: shared_layers.0.bias | Grad Mean: 0.136690 | Grad Max: 0.584834 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.006514 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003009 | Grad Max: 0.003009 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000940 | Grad Max: 0.383743 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016637 | Grad Max: 2.148637 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.003992 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005348 | Grad Max: 0.032229 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000182 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001095 | Grad Max: 0.003683 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000087 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000289 | Grad Max: 0.000863 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001114 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005809 | Grad Max: 0.005809 [GRADIENT NORM TOTAL] 3.8840 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.626 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028864 0.49711356] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.548 | Std: 0.061 [MASKS] A(Pass/Fail): 662/1386 | B: 569/1287 | C: 441/1607 [LOSS Ex1] A: 0.65178 | B: 0.64181 | C: 0.63365 
[LOGITS Ex2 A] Mean Abs: 1.931 | Max: 6.411 [LOSS Ex2] A: 0.14385 | B: 0.35678 | C: 0.27425 ** [JOINT LOSS] ** : 0.900707 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005909 | Grad Max: 0.144674 -> Layer: shared_layers.0.bias | Grad Mean: 0.459312 | Grad Max: 1.988287 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005858 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006472 | Grad Max: 0.006472 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002915 | Grad Max: 0.464472 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054475 | Grad Max: 2.600086 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000440 | Grad Max: 0.015749 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027235 | Grad Max: 0.161057 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000581 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005819 | Grad Max: 0.011864 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000301 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001546 | Grad Max: 0.003731 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001507 | Grad Max: 0.002833 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031042 | Grad Max: 0.031042 [GRADIENT NORM TOTAL] 9.3580 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.106 | Max: 0.555 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54129666 0.45870334] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.059 [MASKS] A(Pass/Fail): 662/1386 | B: 608/1440 | C: 420/1628 [LOSS Ex1] A: 0.65169 | B: 0.64090 | C: 0.63691 [LOGITS Ex2 A] Mean Abs: 1.930 | Max: 5.970 [LOSS Ex2] A: 0.14682 | B: 0.37510 | C: 0.26786 ** [JOINT LOSS] ** : 0.906427 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003825 | Grad Max: 0.105519 -> Layer: shared_layers.0.bias | Grad Mean: 0.272900 | Grad Max: 1.479888 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.005814 -> Layer: 
exit1_layers.0.bias | Grad Mean: 0.000238 | Grad Max: 0.000238 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001807 | Grad Max: 0.257144 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032965 | Grad Max: 1.423256 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000253 | Grad Max: 0.009375 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015602 | Grad Max: 0.084971 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000392 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003365 | Grad Max: 0.007513 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000183 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.001995 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000841 | Grad Max: 0.001860 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017968 | Grad Max: 0.017968 [GRADIENT NORM TOTAL] 5.7688 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.706 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71697605 0.28302395] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.062 [MASKS] A(Pass/Fail): 710/1338 | B: 605/1443 | C: 453/1595 [LOSS Ex1] A: 0.64633 | B: 0.64169 | C: 0.63270 [LOGITS Ex2 A] Mean Abs: 2.014 | Max: 6.617 [LOSS Ex2] A: 0.13875 | B: 0.36197 | C: 0.26125 ** [JOINT LOSS] ** : 0.894230 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005385 | Grad Max: 0.146922 -> Layer: shared_layers.0.bias | Grad Mean: 0.349903 | Grad Max: 1.247234 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002291 | Grad Max: 0.006191 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005873 | Grad Max: 0.005873 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.339470 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042502 | Grad Max: 1.907193 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.011711 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020376 | Grad Max: 0.114601 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000473 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004343 | Grad Max: 0.009834 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001140 | Grad Max: 0.002798 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000991 | Grad Max: 0.002237 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020975 | Grad Max: 0.020975 [GRADIENT NORM TOTAL] 7.1898 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.786 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50093734 0.49906266] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 699/1349 | B: 605/1443 | C: 447/1601 [LOSS Ex1] A: 0.65238 | B: 0.63756 | C: 0.63555 [LOGITS Ex2 A] Mean Abs: 2.023 | Max: 5.664 [LOSS Ex2] A: 0.13594 | B: 0.34700 | C: 0.26856 ** [JOINT LOSS] ** : 0.892331 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005078 | Grad Max: 0.132285 -> Layer: shared_layers.0.bias | Grad Mean: 0.407875 | Grad Max: 1.653904 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005760 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002434 | Grad Max: 0.002434 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002602 | Grad Max: 0.402678 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048633 | Grad Max: 2.253958 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000395 | Grad Max: 0.014259 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024592 | Grad Max: 0.140473 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000558 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005202 | Grad Max: 0.011112 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001378 | Grad Max: 0.003349 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001285 | Grad Max: 0.002996 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.027231 | Grad Max: 0.027231 [GRADIENT NORM TOTAL] 8.3461 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.538 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6733552 0.32664478] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 683/1365 | B: 569/1287 | C: 331/1045 [LOSS Ex1] A: 0.64873 | B: 0.64169 | C: 0.62964 [LOGITS Ex2 A] Mean Abs: 1.995 | Max: 6.022 [LOSS Ex2] A: 0.15172 | B: 0.34906 | C: 0.26923 ** [JOINT LOSS] ** : 0.896690 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001896 | Grad Max: 0.043161 -> Layer: shared_layers.0.bias | Grad Mean: 0.057338 | Grad Max: 0.284761 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.006251 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000902 | Grad Max: 0.000902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000550 | Grad Max: 0.150429 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009289 | Grad Max: 0.843301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003400 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002346 | Grad Max: 0.018381 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000143 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000448 | Grad Max: 0.002320 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000122 | Grad Max: 0.000556 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000214 | Grad Max: 0.000730 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001354 | Grad Max: 0.001354 [GRADIENT NORM TOTAL] 1.8810 [EPOCH SUMMARY] Train Loss: 0.9008 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8797 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8813 -> New: 0.8797) ############################## EPOCH 107/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.626 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6075865 0.39241353] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 578/1038 | B: 608/1440 | C: 463/1585 [LOSS Ex1] A: 0.64712 | B: 0.64077 | C: 0.63346 [LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.424 [LOSS Ex2] A: 0.13753 | B: 0.37468 | C: 0.24130 ** [JOINT LOSS] ** : 0.891616 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003727 | Grad Max: 0.097203 -> Layer: shared_layers.0.bias | Grad Mean: 0.226806 | Grad Max: 0.905685 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.005855 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003308 | Grad Max: 0.003308 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001561 | Grad Max: 0.225247 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028439 | Grad Max: 1.249406 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000226 | Grad Max: 0.009900 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013889 | Grad Max: 0.092289 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003023 | Grad Max: 0.006628 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000807 | Grad Max: 0.002052 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000775 | Grad Max: 0.001858 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015965 | Grad Max: 0.015965 [GRADIENT NORM TOTAL] 4.7310 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.787 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50706255 0.49293745] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 694/1354 | B: 605/1443 | C: 432/1616 [LOSS Ex1] A: 0.64763 | B: 0.64155 | C: 0.63755 [LOGITS Ex2 A] Mean Abs: 2.007 | Max: 6.342 [LOSS Ex2] A: 0.13789 | B: 0.36515 | C: 0.25942 ** [JOINT LOSS] ** : 0.896393 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001863 | Grad Max: 0.098911 -> Layer: shared_layers.0.bias | Grad Mean: 0.056185 | Grad Max: 0.309922 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.005994 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001920 | Grad Max: 0.001920 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000490 | Grad Max: 0.103453 -> Layer: exit2_layers.0.bias | Grad Mean: 0.008134 | Grad Max: 0.582226 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002797 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001684 | Grad Max: 0.017922 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000124 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000294 | Grad Max: 0.001930 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000480 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.000870 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000281 | Grad Max: 0.000281 [GRADIENT NORM TOTAL] 1.6406 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.739 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5109316 0.48906842] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 690/1358 | B: 605/1443 | C: 462/1586 [LOSS Ex1] A: 0.64488 | B: 0.63739 | C: 0.63203 [LOGITS Ex2 A] Mean Abs: 2.015 | Max: 6.477 [LOSS Ex2] A: 0.14333 | B: 0.34206 | C: 0.27288 ** [JOINT LOSS] ** : 0.890861 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.110087 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.225452 | Grad Max: 0.946322 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002435 | Grad Max: 0.006858 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009392 | Grad Max: 0.009392 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001566 | Grad Max: 0.297597 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028712 | Grad Max: 1.667716 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.008437 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014186 | Grad Max: 0.069135 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000385 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003084 | Grad Max: 0.006922 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000187 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000816 | Grad Max: 0.002064 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000780 | Grad Max: 0.002138 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016297 | Grad Max: 0.016297 [GRADIENT NORM TOTAL] 4.9064 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.762 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50304985 0.49695015] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 684/1364 | B: 570/1286 | C: 471/1577 [LOSS Ex1] A: 0.64389 | B: 0.64151 | C: 0.63366 [LOGITS Ex2 A] Mean Abs: 1.969 | Max: 5.804 [LOSS Ex2] A: 0.15585 | B: 0.34996 | C: 0.25910 ** [JOINT LOSS] ** : 0.894657 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002610 | Grad Max: 0.086777 -> Layer: shared_layers.0.bias | Grad Mean: 0.140579 | Grad Max: 0.591293 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006325 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003217 | Grad Max: 0.003217 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.332006 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015670 | Grad Max: 1.855617 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000089 | Grad Max: 0.004134 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005036 | Grad Max: 0.034448 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000183 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000927 | Grad Max: 0.003278 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.000946 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000255 | Grad Max: 0.000947 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004334 | Grad Max: 0.004334 [GRADIENT NORM TOTAL] 3.3972 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.630 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028401 0.49715987] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 662/1386 | B: 608/1440 | C: 448/1600 [LOSS Ex1] A: 0.65157 | B: 0.64058 | C: 0.63575 [LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.808 [LOSS Ex2] A: 0.13375 | B: 0.36745 | C: 0.26984 ** [JOINT LOSS] ** : 0.899647 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002504 | Grad Max: 0.060996 -> Layer: shared_layers.0.bias | Grad Mean: 0.094363 | Grad Max: 0.488117 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.006087 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007844 | Grad Max: 0.007844 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000692 | Grad Max: 0.129941 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011815 | Grad Max: 0.725151 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.004402 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004554 | Grad Max: 0.036387 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000976 | Grad Max: 0.003221 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000082 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000285 | Grad 
Max: 0.000798 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000375 | Grad Max: 0.001288 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007148 | Grad Max: 0.007148 [GRADIENT NORM TOTAL] 2.2021 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.559 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54148585 0.45851412] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.059 [MASKS] A(Pass/Fail): 662/1386 | B: 607/1441 | C: 468/1580 [LOSS Ex1] A: 0.65148 | B: 0.64136 | C: 0.63158 [LOGITS Ex2 A] Mean Abs: 1.964 | Max: 6.371 [LOSS Ex2] A: 0.15099 | B: 0.36922 | C: 0.26369 ** [JOINT LOSS] ** : 0.902772 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002915 | Grad Max: 0.064886 -> Layer: shared_layers.0.bias | Grad Mean: 0.182904 | Grad Max: 0.867706 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.007814 -> Layer: exit1_layers.0.bias | Grad Mean: 0.016352 | Grad Max: 0.016352 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001238 | Grad Max: 0.190854 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022459 | Grad Max: 1.065493 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000148 | Grad Max: 0.006805 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009008 | Grad Max: 0.057484 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000248 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001857 | Grad Max: 0.005223 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000114 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000488 | Grad Max: 0.001400 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001580 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009648 | Grad Max: 0.009648 [GRADIENT NORM TOTAL] 4.0687 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.711 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.71833163 0.28166834] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.062 [MASKS] A(Pass/Fail): 711/1337 | B: 608/1440 | C: 469/1579 [LOSS Ex1] A: 0.64609 | B: 0.63720 | C: 0.63224 [LOGITS Ex2 A] Mean Abs: 2.003 | Max: 6.662 [LOSS Ex2] A: 0.13262 | B: 0.34297 | C: 0.23266 ** [JOINT LOSS] ** : 0.874591 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001635 | Grad Max: 0.037325 -> Layer: shared_layers.0.bias | Grad Mean: 0.096186 | Grad Max: 0.418764 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.006173 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000439 | Grad Max: 0.000439 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.210897 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012349 | Grad Max: 1.196857 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003380 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003294 | Grad Max: 0.027572 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000180 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000657 | Grad Max: 0.002826 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.000656 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000508 | Grad Max: 0.001200 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003727 | Grad Max: 0.003727 [GRADIENT NORM TOTAL] 2.6778 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.791 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50092536 0.49907464] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.061 [MASKS] A(Pass/Fail): 699/1349 | B: 570/1286 | C: 466/1582 [LOSS Ex1] A: 0.65215 | B: 0.64133 | C: 0.63564 [LOGITS Ex2 A] Mean Abs: 1.981 | Max: 6.423 [LOSS Ex2] A: 0.12880 | B: 0.34517 | C: 0.26345 ** [JOINT LOSS] ** : 0.888847 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.002946 | Grad Max: 0.098270 -> Layer: shared_layers.0.bias | Grad Mean: 0.139688 | Grad Max: 0.586628 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005860 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004731 | Grad Max: 0.004731 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001029 | Grad Max: 0.209463 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018134 | Grad Max: 1.173887 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.005395 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007676 | Grad Max: 0.039797 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000232 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001709 | Grad Max: 0.004539 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000464 | Grad Max: 0.001461 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000466 | Grad Max: 0.001453 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009130 | Grad Max: 0.009130 [GRADIENT NORM TOTAL] 3.3050 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.542 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.674489 0.32551098] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 683/1365 | B: 608/1440 | C: 452/1596 [LOSS Ex1] A: 0.64847 | B: 0.64039 | C: 0.63819 [LOGITS Ex2 A] Mean Abs: 1.984 | Max: 6.087 [LOSS Ex2] A: 0.13895 | B: 0.36968 | C: 0.28897 ** [JOINT LOSS] ** : 0.908214 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.055958 -> Layer: shared_layers.0.bias | Grad Mean: 0.081397 | Grad Max: 0.315782 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005931 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002764 | Grad Max: 0.002764 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000633 | Grad Max: 0.210123 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.010930 | Grad Max: 1.184491 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.002887 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002457 | Grad Max: 0.020966 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000142 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000531 | Grad Max: 0.002534 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000158 | Grad Max: 0.000577 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000261 | Grad Max: 0.000883 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004361 | Grad Max: 0.004361 [GRADIENT NORM TOTAL] 2.5028 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.631 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.60815537 0.3918446 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.062 [MASKS] A(Pass/Fail): 578/1038 | B: 607/1441 | C: 445/1603 [LOSS Ex1] A: 0.64684 | B: 0.64116 | C: 0.63927 [LOGITS Ex2 A] Mean Abs: 2.026 | Max: 6.434 [LOSS Ex2] A: 0.13996 | B: 0.35614 | C: 0.27824 ** [JOINT LOSS] ** : 0.900536 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003045 | Grad Max: 0.116475 -> Layer: shared_layers.0.bias | Grad Mean: 0.202387 | Grad Max: 1.255913 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005864 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000854 | Grad Max: 0.000854 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001351 | Grad Max: 0.241260 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024163 | Grad Max: 1.350985 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000158 | Grad Max: 0.004923 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009585 | Grad Max: 0.044032 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000253 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002136 | Grad Max: 0.005199 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000551 | Grad Max: 0.001623 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001497 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009973 | Grad Max: 0.009973 [GRADIENT NORM TOTAL] 4.5643 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.793 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071182 0.49288175] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 695/1353 | B: 608/1440 | C: 435/1613 [LOSS Ex1] A: 0.64734 | B: 0.63699 | C: 0.63510 [LOGITS Ex2 A] Mean Abs: 2.020 | Max: 8.029 [LOSS Ex2] A: 0.13246 | B: 0.33569 | C: 0.27631 ** [JOINT LOSS] ** : 0.887966 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003552 | Grad Max: 0.138063 -> Layer: shared_layers.0.bias | Grad Mean: 0.118398 | Grad Max: 0.479727 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002283 | Grad Max: 0.006035 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004881 | Grad Max: 0.004881 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000971 | Grad Max: 0.167201 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016946 | Grad Max: 0.918232 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.004602 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006824 | Grad Max: 0.033297 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000244 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001558 | Grad Max: 0.004704 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000414 | Grad Max: 0.001099 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000436 | Grad Max: 0.001462 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008108 | Grad Max: 0.008108 [GRADIENT NORM TOTAL] 2.9013 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.745 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51088524 0.4891148 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 690/1358 | B: 570/1286 | C: 448/1600 [LOSS Ex1] A: 0.64458 | B: 0.64112 | C: 0.63615 [LOGITS Ex2 A] Mean Abs: 1.970 | Max: 5.546 [LOSS Ex2] A: 0.14369 | B: 0.35594 | C: 0.27471 ** [JOINT LOSS] ** : 0.898727 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003609 | Grad Max: 0.121610 -> Layer: shared_layers.0.bias | Grad Mean: 0.384691 | Grad Max: 1.705843 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006804 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008879 | Grad Max: 0.008879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002337 | Grad Max: 0.505654 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043259 | Grad Max: 2.818784 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000339 | Grad Max: 0.011650 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021276 | Grad Max: 0.119902 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000542 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004410 | Grad Max: 0.009250 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001155 | Grad Max: 0.002683 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001016 | Grad Max: 0.002085 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022419 | Grad Max: 0.022419 [GRADIENT NORM TOTAL] 8.0343 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.768 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50315887 0.49684113] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 684/1364 | B: 608/1440 | C: 455/1593 [LOSS Ex1] A: 0.64358 | B: 0.64018 | C: 0.63643 [LOGITS Ex2 A] Mean Abs: 1.963 | Max: 
7.228 [LOSS Ex2] A: 0.15715 | B: 0.37997 | C: 0.26795 ** [JOINT LOSS] ** : 0.908427 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004347 | Grad Max: 0.159167 -> Layer: shared_layers.0.bias | Grad Mean: 0.417892 | Grad Max: 2.163827 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006317 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003501 | Grad Max: 0.003501 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002640 | Grad Max: 0.521056 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049372 | Grad Max: 2.905117 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.012853 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023601 | Grad Max: 0.127272 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000500 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004956 | Grad Max: 0.010163 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000252 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001323 | Grad Max: 0.003128 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001280 | Grad Max: 0.002380 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026908 | Grad Max: 0.026908 [GRADIENT NORM TOTAL] 9.0199 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.635 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50287426 0.4971258 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 662/1386 | B: 607/1441 | C: 300/1076 [LOSS Ex1] A: 0.65129 | B: 0.64095 | C: 0.63446 [LOGITS Ex2 A] Mean Abs: 1.978 | Max: 5.956 [LOSS Ex2] A: 0.12303 | B: 0.36451 | C: 0.27322 ** [JOINT LOSS] ** : 0.895821 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002997 | Grad Max: 0.073089 -> Layer: shared_layers.0.bias | Grad Mean: 0.109426 | Grad Max: 0.487654 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006608 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011286 | 
Grad Max: 0.011286 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000772 | Grad Max: 0.390631 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013331 | Grad Max: 2.184359 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003922 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002041 | Grad Max: 0.024002 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000186 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.002346 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000060 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000088 | Grad Max: 0.000457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.000932 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000034 | Grad Max: 0.000034 [GRADIENT NORM TOTAL] 3.6679 [EPOCH SUMMARY] Train Loss: 0.8956 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8802 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 108/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.107 | Max: 0.564 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5416047 0.4583953] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.060 [MASKS] A(Pass/Fail): 662/1386 | B: 608/1440 | C: 447/1601 [LOSS Ex1] A: 0.65120 | B: 0.63678 | C: 0.63528 [LOGITS Ex2 A] Mean Abs: 1.976 | Max: 5.833 [LOSS Ex2] A: 0.14540 | B: 0.35122 | C: 0.26691 ** [JOINT LOSS] ** : 0.895600 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003119 | Grad Max: 0.126220 -> Layer: shared_layers.0.bias | Grad Mean: 0.313670 | Grad Max: 1.502957 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006071 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004816 | Grad Max: 0.004816 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.275892 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.037854 | Grad Max: 1.536287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000307 | Grad Max: 0.014235 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019232 | Grad Max: 0.144328 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000454 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003936 | Grad Max: 0.009263 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001026 | Grad Max: 0.002431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000896 | Grad Max: 0.002045 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019762 | Grad Max: 0.019762 [GRADIENT NORM TOTAL] 6.6807 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.716 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71985805 0.28014192] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.063 [MASKS] A(Pass/Fail): 712/1336 | B: 570/1286 | C: 461/1587 [LOSS Ex1] A: 0.64580 | B: 0.64092 | C: 0.63467 [LOGITS Ex2 A] Mean Abs: 2.019 | Max: 6.337 [LOSS Ex2] A: 0.13825 | B: 0.33835 | C: 0.24722 ** [JOINT LOSS] ** : 0.881734 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003096 | Grad Max: 0.089401 -> Layer: shared_layers.0.bias | Grad Mean: 0.180879 | Grad Max: 0.858002 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.006212 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006989 | Grad Max: 0.006989 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001201 | Grad Max: 0.231524 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022128 | Grad Max: 1.290830 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006822 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010349 | Grad Max: 0.055254 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000268 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002219 | Grad Max: 0.005354 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000009 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000611 | Grad Max: 0.001514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000600 | Grad Max: 0.001950 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012782 | Grad Max: 0.012782 [GRADIENT NORM TOTAL] 4.0782 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.797 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009278 0.4990722] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 699/1349 | B: 608/1440 | C: 441/1607 [LOSS Ex1] A: 0.65189 | B: 0.63999 | C: 0.63521 [LOGITS Ex2 A] Mean Abs: 1.974 | Max: 6.484 [LOSS Ex2] A: 0.13849 | B: 0.37502 | C: 0.24820 ** [JOINT LOSS] ** : 0.896270 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006224 | Grad Max: 0.201217 -> Layer: shared_layers.0.bias | Grad Mean: 0.329595 | Grad Max: 1.366487 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005630 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000037 | Grad Max: 0.000037 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.274556 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040259 | Grad Max: 1.479585 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000348 | Grad Max: 0.011182 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021288 | Grad Max: 0.107245 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000459 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004692 | Grad Max: 0.009699 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001246 | Grad Max: 0.002921 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001170 | Grad Max: 0.002320 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024437 | Grad Max: 0.024437 [GRADIENT NORM TOTAL] 6.4096 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.546 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6756378 0.32436216] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 683/1365 | B: 607/1441 | C: 470/1578 [LOSS Ex1] A: 0.64819 | B: 0.64076 | C: 0.63380 [LOGITS Ex2 A] Mean Abs: 1.975 | Max: 6.560 [LOSS Ex2] A: 0.15112 | B: 0.37619 | C: 0.24735 ** [JOINT LOSS] ** : 0.899139 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007019 | Grad Max: 0.180353 -> Layer: shared_layers.0.bias | Grad Mean: 0.400353 | Grad Max: 1.634655 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006540 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007781 | Grad Max: 0.007781 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.258309 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046562 | Grad Max: 1.333758 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.015328 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025769 | Grad Max: 0.141154 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000588 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005530 | Grad Max: 0.011235 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000288 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001465 | Grad Max: 0.003447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.002825 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029282 | Grad Max: 0.029282 [GRADIENT NORM TOTAL] 7.3534 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.636 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6088483 0.3911517] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 578/1038 | B: 608/1440 | C: 433/1615 [LOSS Ex1] A: 0.64656 | B: 0.63659 | C: 0.63922 [LOGITS Ex2 A] Mean Abs: 2.048 | Max: 7.499 [LOSS Ex2] A: 0.13732 | B: 0.34665 | C: 
0.26778 ** [JOINT LOSS] ** : 0.891373 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.120304 -> Layer: shared_layers.0.bias | Grad Mean: 0.248335 | Grad Max: 1.447143 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006319 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007277 | Grad Max: 0.007277 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001563 | Grad Max: 0.262363 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028639 | Grad Max: 1.472456 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.007874 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012202 | Grad Max: 0.065659 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002532 | Grad Max: 0.005981 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000668 | Grad Max: 0.001589 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000540 | Grad Max: 0.001681 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012818 | Grad Max: 0.012818 [GRADIENT NORM TOTAL] 5.5896 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.798 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071631 0.4928369] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 695/1353 | B: 570/1286 | C: 487/1561 [LOSS Ex1] A: 0.64707 | B: 0.64074 | C: 0.63576 [LOGITS Ex2 A] Mean Abs: 2.045 | Max: 8.265 [LOSS Ex2] A: 0.12930 | B: 0.33782 | C: 0.28932 ** [JOINT LOSS] ** : 0.893341 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004377 | Grad Max: 0.115427 -> Layer: shared_layers.0.bias | Grad Mean: 0.300145 | Grad Max: 1.398243 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.006122 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007924 | Grad Max: 0.007924 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.002029 | Grad Max: 0.304767 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037394 | Grad Max: 1.696210 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.010011 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018200 | Grad Max: 0.097177 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000418 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003889 | Grad Max: 0.008675 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000215 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001036 | Grad Max: 0.002643 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001021 | Grad Max: 0.002392 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021114 | Grad Max: 0.021114 [GRADIENT NORM TOTAL] 6.4336 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.750 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.510954 0.48904595] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 690/1358 | B: 609/1439 | C: 474/1574 [LOSS Ex1] A: 0.64430 | B: 0.63982 | C: 0.63107 [LOGITS Ex2 A] Mean Abs: 2.015 | Max: 5.590 [LOSS Ex2] A: 0.13603 | B: 0.36484 | C: 0.26697 ** [JOINT LOSS] ** : 0.894342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002866 | Grad Max: 0.065265 -> Layer: shared_layers.0.bias | Grad Mean: 0.084908 | Grad Max: 0.441308 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.006653 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006544 | Grad Max: 0.006544 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000765 | Grad Max: 0.193133 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013247 | Grad Max: 1.081777 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003220 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005052 | Grad Max: 0.025503 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000189 -> Layer: exit2_layers.6.bias | Grad Mean: 
0.001132 | Grad Max: 0.003452 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.001002 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000331 | Grad Max: 0.001106 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006141 | Grad Max: 0.006141 [GRADIENT NORM TOTAL] 2.4173 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.774 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5031857 0.49681428] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 684/1364 | B: 607/1441 | C: 473/1575 [LOSS Ex1] A: 0.64331 | B: 0.64059 | C: 0.63225 [LOGITS Ex2 A] Mean Abs: 1.973 | Max: 6.455 [LOSS Ex2] A: 0.15721 | B: 0.38250 | C: 0.26074 ** [JOINT LOSS] ** : 0.905535 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005656 | Grad Max: 0.184304 -> Layer: shared_layers.0.bias | Grad Mean: 0.509514 | Grad Max: 2.418754 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006347 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000939 | Grad Max: 0.000939 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003101 | Grad Max: 0.535005 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058024 | Grad Max: 2.952957 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000463 | Grad Max: 0.014967 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028946 | Grad Max: 0.150901 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000618 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006014 | Grad Max: 0.012597 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000292 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001587 | Grad Max: 0.003527 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.002569 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031495 | Grad Max: 0.031495 [GRADIENT NORM TOTAL] 10.5124 >>> [TRAIN] BATCH 8 START 
<<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.640 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028316 0.49716848] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.062 [MASKS] A(Pass/Fail): 663/1385 | B: 608/1440 | C: 489/1559 [LOSS Ex1] A: 0.65106 | B: 0.63641 | C: 0.63177 [LOGITS Ex2 A] Mean Abs: 1.929 | Max: 6.682 [LOSS Ex2] A: 0.14364 | B: 0.36640 | C: 0.25972 ** [JOINT LOSS] ** : 0.896335 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007240 | Grad Max: 0.176619 -> Layer: shared_layers.0.bias | Grad Mean: 0.572778 | Grad Max: 2.490349 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006045 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006523 | Grad Max: 0.006523 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003748 | Grad Max: 0.422524 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070169 | Grad Max: 2.383838 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000584 | Grad Max: 0.018924 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036568 | Grad Max: 0.197122 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000817 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007763 | Grad Max: 0.017055 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000394 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002056 | Grad Max: 0.004858 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001964 | Grad Max: 0.003385 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040706 | Grad Max: 0.040706 [GRADIENT NORM TOTAL] 11.7107 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.568 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54179025 0.45820972] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.548 | Std: 0.060 [MASKS] A(Pass/Fail): 662/1386 | B: 570/1286 | C: 462/1586 [LOSS Ex1] A: 0.65098 | B: 0.64057 | C: 0.63488 
[LOGITS Ex2 A] Mean Abs: 1.941 | Max: 7.488 [LOSS Ex2] A: 0.14105 | B: 0.34496 | C: 0.25891 ** [JOINT LOSS] ** : 0.890446 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003196 | Grad Max: 0.076776 -> Layer: shared_layers.0.bias | Grad Mean: 0.207932 | Grad Max: 0.956079 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.005653 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000403 | Grad Max: 0.000403 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001350 | Grad Max: 0.232827 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024880 | Grad Max: 1.307485 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.009438 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013276 | Grad Max: 0.085765 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000341 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002800 | Grad Max: 0.006401 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000736 | Grad Max: 0.002040 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000686 | Grad Max: 0.001930 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014381 | Grad Max: 0.014381 [GRADIENT NORM TOTAL] 4.2408 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.721 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7211653 0.27883464] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.063 [MASKS] A(Pass/Fail): 712/1336 | B: 609/1439 | C: 476/1572 [LOSS Ex1] A: 0.64555 | B: 0.63964 | C: 0.63305 [LOGITS Ex2 A] Mean Abs: 2.065 | Max: 6.254 [LOSS Ex2] A: 0.14648 | B: 0.36853 | C: 0.29700 ** [JOINT LOSS] ** : 0.910086 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007885 | Grad Max: 0.205686 -> Layer: shared_layers.0.bias | Grad Mean: 0.584902 | Grad Max: 2.587913 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.005792 -> Layer: 
exit1_layers.0.bias | Grad Mean: 0.002407 | Grad Max: 0.002407 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003764 | Grad Max: 0.426963 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069852 | Grad Max: 2.324125 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000547 | Grad Max: 0.019074 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034167 | Grad Max: 0.194871 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000690 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007239 | Grad Max: 0.014610 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001871 | Grad Max: 0.004285 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001735 | Grad Max: 0.003123 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035516 | Grad Max: 0.035516 [GRADIENT NORM TOTAL] 11.8424 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.801 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009626 0.4990374] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 699/1349 | B: 607/1441 | C: 457/1591 [LOSS Ex1] A: 0.65168 | B: 0.64042 | C: 0.63544 [LOGITS Ex2 A] Mean Abs: 2.082 | Max: 6.105 [LOSS Ex2] A: 0.14158 | B: 0.39517 | C: 0.29010 ** [JOINT LOSS] ** : 0.918128 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008829 | Grad Max: 0.314288 -> Layer: shared_layers.0.bias | Grad Mean: 0.899708 | Grad Max: 4.253185 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005510 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003792 | Grad Max: 0.003792 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005724 | Grad Max: 0.634322 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107226 | Grad Max: 3.591946 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000863 | Grad Max: 0.033421 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054270 | Grad Max: 0.324101 -> Layer: 
exit2_layers.6.weight | Grad Mean: 0.000105 | Grad Max: 0.001126 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011360 | Grad Max: 0.022736 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000496 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002976 | Grad Max: 0.006812 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002701 | Grad Max: 0.004601 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057296 | Grad Max: 0.057296 [GRADIENT NORM TOTAL] 18.6740 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.549 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6765464 0.32345363] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 684/1364 | B: 608/1440 | C: 427/1621 [LOSS Ex1] A: 0.64797 | B: 0.63625 | C: 0.63629 [LOGITS Ex2 A] Mean Abs: 2.064 | Max: 6.247 [LOSS Ex2] A: 0.16054 | B: 0.36157 | C: 0.29683 ** [JOINT LOSS] ** : 0.913151 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006997 | Grad Max: 0.223177 -> Layer: shared_layers.0.bias | Grad Mean: 0.680833 | Grad Max: 2.852823 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.006158 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000879 | Grad Max: 0.000879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004273 | Grad Max: 0.500297 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080234 | Grad Max: 2.826247 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000671 | Grad Max: 0.024937 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042353 | Grad Max: 0.251689 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000862 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008805 | Grad Max: 0.018374 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000396 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002311 | Grad Max: 0.005367 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002057 | Grad Max: 0.003616 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.044062 | Grad Max: 0.044062 [GRADIENT NORM TOTAL] 13.8491 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.641 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6093364 0.3906636] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 578/1038 | B: 570/1286 | C: 299/1077 [LOSS Ex1] A: 0.64634 | B: 0.64041 | C: 0.63322 [LOGITS Ex2 A] Mean Abs: 2.050 | Max: 5.807 [LOSS Ex2] A: 0.13632 | B: 0.34107 | C: 0.25672 ** [JOINT LOSS] ** : 0.884692 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001700 | Grad Max: 0.025605 -> Layer: shared_layers.0.bias | Grad Mean: 0.077471 | Grad Max: 0.265646 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006100 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002044 | Grad Max: 0.002044 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000649 | Grad Max: 0.348041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011307 | Grad Max: 1.953935 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003467 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002345 | Grad Max: 0.026262 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000425 | Grad Max: 0.002821 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000405 | Grad Max: 0.001110 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001691 | Grad Max: 0.001691 [GRADIENT NORM TOTAL] 3.0918 [EPOCH SUMMARY] Train Loss: 0.8979 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8840 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 109/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.803 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071257 0.4928743] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 695/1353 | B: 609/1439 | C: 444/1604 [LOSS Ex1] A: 0.64686 | B: 0.63950 | C: 0.63707 [LOGITS Ex2 A] Mean Abs: 2.000 | Max: 6.311 [LOSS Ex2] A: 0.13694 | B: 0.39071 | C: 0.27925 ** [JOINT LOSS] ** : 0.910110 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008388 | Grad Max: 0.241826 -> Layer: shared_layers.0.bias | Grad Mean: 0.591426 | Grad Max: 2.685166 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005838 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003237 | Grad Max: 0.003237 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003775 | Grad Max: 0.463758 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070013 | Grad Max: 2.584737 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.018746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034403 | Grad Max: 0.185875 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000735 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007352 | Grad Max: 0.015126 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000350 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001965 | Grad Max: 0.004511 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001874 | Grad Max: 0.003321 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039865 | Grad Max: 0.039865 [GRADIENT NORM TOTAL] 11.8331 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.755 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51096463 0.48903537] | 
Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 691/1357 | B: 607/1441 | C: 449/1599 [LOSS Ex1] A: 0.64408 | B: 0.64028 | C: 0.63428 [LOGITS Ex2 A] Mean Abs: 1.982 | Max: 6.578 [LOSS Ex2] A: 0.13443 | B: 0.39042 | C: 0.26743 ** [JOINT LOSS] ** : 0.903639 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007567 | Grad Max: 0.196971 -> Layer: shared_layers.0.bias | Grad Mean: 0.623308 | Grad Max: 2.729717 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002370 | Grad Max: 0.006803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011067 | Grad Max: 0.011067 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003920 | Grad Max: 0.487180 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073128 | Grad Max: 2.758421 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000585 | Grad Max: 0.020431 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036886 | Grad Max: 0.204904 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000681 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007867 | Grad Max: 0.015479 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000385 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002080 | Grad Max: 0.005015 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001890 | Grad Max: 0.003649 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040556 | Grad Max: 0.040556 [GRADIENT NORM TOTAL] 12.4371 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.778 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032143 0.49678567] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 685/1363 | B: 608/1440 | C: 449/1599 [LOSS Ex1] A: 0.64310 | B: 0.63611 | C: 0.63380 [LOGITS Ex2 A] Mean Abs: 1.980 | Max: 6.685 [LOSS Ex2] A: 0.14603 | B: 0.35141 | C: 0.24968 ** [JOINT LOSS] ** : 0.886707 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003536 | Grad Max: 
0.095605 -> Layer: shared_layers.0.bias | Grad Mean: 0.265313 | Grad Max: 0.961489 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002335 | Grad Max: 0.006120 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002133 | Grad Max: 0.002133 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001667 | Grad Max: 0.260715 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030723 | Grad Max: 1.461794 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000248 | Grad Max: 0.011682 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015387 | Grad Max: 0.113249 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000344 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003115 | Grad Max: 0.006534 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000806 | Grad Max: 0.002015 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000731 | Grad Max: 0.001841 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014944 | Grad Max: 0.014944 [GRADIENT NORM TOTAL] 5.3857 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.644 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5028035 0.4971965] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.062 [MASKS] A(Pass/Fail): 663/1385 | B: 570/1286 | C: 466/1582 [LOSS Ex1] A: 0.65088 | B: 0.64027 | C: 0.63512 [LOGITS Ex2 A] Mean Abs: 1.997 | Max: 5.486 [LOSS Ex2] A: 0.13985 | B: 0.34426 | C: 0.28503 ** [JOINT LOSS] ** : 0.898468 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004274 | Grad Max: 0.143856 -> Layer: shared_layers.0.bias | Grad Mean: 0.363221 | Grad Max: 1.782512 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.006240 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009565 | Grad Max: 0.009565 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.371051 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042964 | Grad Max: 2.073478 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.013300 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022351 | Grad Max: 0.121769 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000506 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004713 | Grad Max: 0.010713 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001251 | Grad Max: 0.002990 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001102 | Grad Max: 0.002323 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024163 | Grad Max: 0.024163 [GRADIENT NORM TOTAL] 7.7659 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.108 | Max: 0.572 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417897 0.4582103] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 662/1386 | B: 609/1439 | C: 485/1563 [LOSS Ex1] A: 0.65081 | B: 0.63936 | C: 0.63334 [LOGITS Ex2 A] Mean Abs: 2.022 | Max: 5.930 [LOSS Ex2] A: 0.14745 | B: 0.37262 | C: 0.29537 ** [JOINT LOSS] ** : 0.912983 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007328 | Grad Max: 0.232219 -> Layer: shared_layers.0.bias | Grad Mean: 0.641800 | Grad Max: 3.028861 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005723 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005706 | Grad Max: 0.005706 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004093 | Grad Max: 0.473843 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076659 | Grad Max: 2.650718 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.024488 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039427 | Grad Max: 0.225977 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000793 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008314 | Grad Max: 0.016510 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000397 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002184 | Grad Max: 0.005016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001929 | Grad Max: 0.003473 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041303 | Grad Max: 0.041303 [GRADIENT NORM TOTAL] 13.1337 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.724 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7221304 0.27786958] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.063 [MASKS] A(Pass/Fail): 712/1336 | B: 607/1441 | C: 457/1591 [LOSS Ex1] A: 0.64537 | B: 0.64015 | C: 0.63456 [LOGITS Ex2 A] Mean Abs: 2.036 | Max: 6.734 [LOSS Ex2] A: 0.13752 | B: 0.36563 | C: 0.27635 ** [JOINT LOSS] ** : 0.899857 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005916 | Grad Max: 0.150784 -> Layer: shared_layers.0.bias | Grad Mean: 0.414319 | Grad Max: 1.523596 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006422 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000874 | Grad Max: 0.000874 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002657 | Grad Max: 0.392569 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048958 | Grad Max: 2.196939 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000388 | Grad Max: 0.013311 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024434 | Grad Max: 0.127284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000558 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005213 | Grad Max: 0.011032 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001357 | Grad Max: 0.003072 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001172 | Grad Max: 0.002416 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025014 | Grad Max: 0.025014 [GRADIENT NORM TOTAL] 8.4670 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.113 | Max: 0.806 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50095206 0.49904794] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.062 [MASKS] A(Pass/Fail): 700/1348 | B: 608/1440 | C: 457/1591 [LOSS Ex1] A: 0.65151 | B: 0.63598 | C: 0.63732 [LOGITS Ex2 A] Mean Abs: 1.985 | Max: 6.312 [LOSS Ex2] A: 0.12831 | B: 0.35519 | C: 0.25453 ** [JOINT LOSS] ** : 0.887613 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003934 | Grad Max: 0.126344 -> Layer: shared_layers.0.bias | Grad Mean: 0.225344 | Grad Max: 0.948809 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005935 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001857 | Grad Max: 0.001857 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001517 | Grad Max: 0.229924 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027597 | Grad Max: 1.291338 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008889 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013808 | Grad Max: 0.080393 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000338 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002994 | Grad Max: 0.006649 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000783 | Grad Max: 0.002004 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000725 | Grad Max: 0.001961 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014786 | Grad Max: 0.014786 [GRADIENT NORM TOTAL] 4.7139 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.552 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67724586 0.32275417] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 684/1364 | B: 570/1286 | C: 434/1614 [LOSS Ex1] A: 0.64779 | B: 0.64015 | C: 0.63369 [LOGITS Ex2 A] Mean Abs: 1.970 | Max: 6.163 [LOSS Ex2] A: 0.15100 | B: 0.36494 | C: 0.27570 ** [JOINT LOSS] ** : 
0.904424 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005815 | Grad Max: 0.178007 -> Layer: shared_layers.0.bias | Grad Mean: 0.388153 | Grad Max: 1.624420 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006107 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003963 | Grad Max: 0.003963 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.373433 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045103 | Grad Max: 2.050425 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000376 | Grad Max: 0.013883 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023317 | Grad Max: 0.129764 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000485 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004947 | Grad Max: 0.010679 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000255 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001303 | Grad Max: 0.003233 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001141 | Grad Max: 0.002185 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024790 | Grad Max: 0.024790 [GRADIENT NORM TOTAL] 7.7401 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.644 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.609627 0.39037293] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 579/1037 | B: 610/1438 | C: 489/1559 [LOSS Ex1] A: 0.64616 | B: 0.63924 | C: 0.63376 [LOGITS Ex2 A] Mean Abs: 2.015 | Max: 6.230 [LOSS Ex2] A: 0.13781 | B: 0.37023 | C: 0.24822 ** [JOINT LOSS] ** : 0.891809 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.061111 -> Layer: shared_layers.0.bias | Grad Mean: 0.111528 | Grad Max: 0.580055 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006420 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000699 | Grad Max: 0.000699 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000867 | Grad 
Max: 0.230776 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015484 | Grad Max: 1.303083 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004572 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006084 | Grad Max: 0.040072 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001255 | Grad Max: 0.003599 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000346 | Grad Max: 0.001001 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001230 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007826 | Grad Max: 0.007826 [GRADIENT NORM TOTAL] 3.1290 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.807 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071226 0.49287742] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 696/1352 | B: 607/1441 | C: 468/1580 [LOSS Ex1] A: 0.64669 | B: 0.64002 | C: 0.63073 [LOGITS Ex2 A] Mean Abs: 2.049 | Max: 6.741 [LOSS Ex2] A: 0.13311 | B: 0.36744 | C: 0.25706 ** [JOINT LOSS] ** : 0.891682 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006638 | Grad Max: 0.202962 -> Layer: shared_layers.0.bias | Grad Mean: 0.537367 | Grad Max: 2.544029 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006212 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005077 | Grad Max: 0.005077 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003480 | Grad Max: 0.492374 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065133 | Grad Max: 2.748162 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000494 | Grad Max: 0.018710 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031135 | Grad Max: 0.182088 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000640 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006625 | Grad Max: 0.013026 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000321 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001770 | Grad Max: 0.003925 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003567 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035230 | Grad Max: 0.035230 [GRADIENT NORM TOTAL] 11.3716 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.758 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.510898 0.48910198] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 691/1357 | B: 608/1440 | C: 491/1557 [LOSS Ex1] A: 0.64390 | B: 0.63585 | C: 0.62924 [LOGITS Ex2 A] Mean Abs: 2.048 | Max: 5.952 [LOSS Ex2] A: 0.14526 | B: 0.35819 | C: 0.26562 ** [JOINT LOSS] ** : 0.892686 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008090 | Grad Max: 0.226417 -> Layer: shared_layers.0.bias | Grad Mean: 0.624733 | Grad Max: 2.938836 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002392 | Grad Max: 0.006022 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002200 | Grad Max: 0.002200 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.471668 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074705 | Grad Max: 2.635655 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000592 | Grad Max: 0.021053 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037389 | Grad Max: 0.205060 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000761 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007959 | Grad Max: 0.016428 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000340 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002110 | Grad Max: 0.004714 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001902 | Grad Max: 0.003458 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041087 | Grad Max: 0.041087 [GRADIENT NORM TOTAL] 12.8878 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.782 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032491 0.49675092] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 685/1363 | B: 570/1286 | C: 451/1597 [LOSS Ex1] A: 0.64293 | B: 0.64002 | C: 0.63503 [LOGITS Ex2 A] Mean Abs: 1.996 | Max: 7.236 [LOSS Ex2] A: 0.15412 | B: 0.34918 | C: 0.28143 ** [JOINT LOSS] ** : 0.900903 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005681 | Grad Max: 0.187685 -> Layer: shared_layers.0.bias | Grad Mean: 0.258176 | Grad Max: 1.257170 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.006009 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002066 | Grad Max: 0.002066 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001796 | Grad Max: 0.275371 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032209 | Grad Max: 1.548232 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000261 | Grad Max: 0.010097 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016132 | Grad Max: 0.104013 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000388 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003535 | Grad Max: 0.007640 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000919 | Grad Max: 0.002156 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000753 | Grad Max: 0.001782 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016556 | Grad Max: 0.016556 [GRADIENT NORM TOTAL] 5.5991 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.647 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50276625 0.4972337 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.062 [MASKS] A(Pass/Fail): 664/1384 | B: 610/1438 | C: 482/1566 [LOSS Ex1] A: 0.65072 | B: 0.63912 | C: 0.63395 [LOGITS Ex2 A] Mean Abs: 
1.927 | Max: 6.089 [LOSS Ex2] A: 0.13518 | B: 0.38421 | C: 0.26593 ** [JOINT LOSS] ** : 0.903034 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005062 | Grad Max: 0.145214 -> Layer: shared_layers.0.bias | Grad Mean: 0.479391 | Grad Max: 1.928931 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005980 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007024 | Grad Max: 0.007024 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003099 | Grad Max: 0.352762 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057929 | Grad Max: 2.005474 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.017869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030493 | Grad Max: 0.169060 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000655 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006473 | Grad Max: 0.013715 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001738 | Grad Max: 0.003866 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003052 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034628 | Grad Max: 0.034628 [GRADIENT NORM TOTAL] 9.6529 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.574 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54187423 0.45812574] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.060 [MASKS] A(Pass/Fail): 664/1384 | B: 607/1441 | C: 300/1076 [LOSS Ex1] A: 0.65065 | B: 0.63990 | C: 0.63353 [LOGITS Ex2 A] Mean Abs: 1.879 | Max: 6.171 [LOSS Ex2] A: 0.15267 | B: 0.39556 | C: 0.27008 ** [JOINT LOSS] ** : 0.914130 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008560 | Grad Max: 0.207235 -> Layer: shared_layers.0.bias | Grad Mean: 0.672711 | Grad Max: 2.779246 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.006481 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.010033 | Grad Max: 0.010033 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004256 | Grad Max: 0.451210 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080048 | Grad Max: 2.521109 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000673 | Grad Max: 0.024841 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042448 | Grad Max: 0.231237 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000859 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008941 | Grad Max: 0.017867 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000435 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002384 | Grad Max: 0.005305 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002163 | Grad Max: 0.003849 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046559 | Grad Max: 0.046559 [GRADIENT NORM TOTAL] 13.3719 [EPOCH SUMMARY] Train Loss: 0.8999 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8789 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8797 -> New: 0.8789) ############################## EPOCH 110/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.728 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7230741 0.2769259] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 713/1335 | B: 608/1440 | C: 451/1597 [LOSS Ex1] A: 0.64519 | B: 0.63573 | C: 0.63711 [LOGITS Ex2 A] Mean Abs: 1.954 | Max: 6.065 [LOSS Ex2] A: 0.13296 | B: 0.35433 | C: 0.27847 ** [JOINT LOSS] ** : 0.894597 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003585 | Grad Max: 0.107511 -> Layer: shared_layers.0.bias | Grad Mean: 0.331384 | Grad Max: 1.428852 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.005792 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000318 | Grad Max: 0.000318 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 
0.519320 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038572 | Grad Max: 2.883473 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000307 | Grad Max: 0.010803 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019454 | Grad Max: 0.098846 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000445 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004140 | Grad Max: 0.009386 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001107 | Grad Max: 0.002795 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000967 | Grad Max: 0.002200 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021495 | Grad Max: 0.021495 [GRADIENT NORM TOTAL] 7.1900 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.809 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50095963 0.4990404 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 700/1348 | B: 570/1286 | C: 457/1591 [LOSS Ex1] A: 0.65136 | B: 0.63991 | C: 0.63514 [LOGITS Ex2 A] Mean Abs: 2.012 | Max: 5.633 [LOSS Ex2] A: 0.13149 | B: 0.33780 | C: 0.25427 ** [JOINT LOSS] ** : 0.883322 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002836 | Grad Max: 0.120675 -> Layer: shared_layers.0.bias | Grad Mean: 0.289602 | Grad Max: 1.379561 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001655 | Grad Max: 0.001655 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001929 | Grad Max: 0.264667 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035486 | Grad Max: 1.480762 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000276 | Grad Max: 0.009799 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017407 | Grad Max: 0.098196 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000367 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003541 | Grad Max: 0.007504 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000167 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000961 | Grad Max: 0.002184 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000866 | Grad Max: 0.002330 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019745 | Grad Max: 0.019745 [GRADIENT NORM TOTAL] 6.2958 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.555 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6779082 0.3220918] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 685/1363 | B: 610/1438 | C: 461/1587 [LOSS Ex1] A: 0.64762 | B: 0.63901 | C: 0.63270 [LOGITS Ex2 A] Mean Abs: 2.006 | Max: 6.215 [LOSS Ex2] A: 0.15601 | B: 0.36427 | C: 0.27143 ** [JOINT LOSS] ** : 0.903679 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004660 | Grad Max: 0.180078 -> Layer: shared_layers.0.bias | Grad Mean: 0.475921 | Grad Max: 2.200845 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006251 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001513 | Grad Max: 0.001513 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002953 | Grad Max: 0.341563 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054857 | Grad Max: 1.925379 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.017789 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029750 | Grad Max: 0.169325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000579 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006233 | Grad Max: 0.012463 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000293 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001676 | Grad Max: 0.003963 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001514 | Grad Max: 0.003112 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033458 | Grad Max: 0.033458 [GRADIENT NORM TOTAL] 9.5248 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.647 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61001176 0.3899882 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 579/1037 | B: 607/1441 | C: 466/1582 [LOSS Ex1] A: 0.64599 | B: 0.63980 | C: 0.63414 [LOGITS Ex2 A] Mean Abs: 2.010 | Max: 6.268 [LOSS Ex2] A: 0.12863 | B: 0.36448 | C: 0.26701 ** [JOINT LOSS] ** : 0.893345 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002554 | Grad Max: 0.078741 -> Layer: shared_layers.0.bias | Grad Mean: 0.196105 | Grad Max: 0.975178 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.006595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000388 | Grad Max: 0.000388 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001366 | Grad Max: 0.201521 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024816 | Grad Max: 1.107025 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.006903 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011298 | Grad Max: 0.064166 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000288 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002372 | Grad Max: 0.005601 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000625 | Grad Max: 0.001467 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001588 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011166 | Grad Max: 0.011166 [GRADIENT NORM TOTAL] 4.3118 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.811 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071314 0.49286854] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 696/1352 | B: 608/1440 | C: 480/1568 [LOSS Ex1] A: 0.64652 | B: 0.63562 | C: 0.62950 [LOGITS Ex2 A] Mean Abs: 
1.974 | Max: 7.122 [LOSS Ex2] A: 0.14178 | B: 0.35594 | C: 0.23169 ** [JOINT LOSS] ** : 0.880352 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006597 | Grad Max: 0.179681 -> Layer: shared_layers.0.bias | Grad Mean: 0.437284 | Grad Max: 1.730302 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.005965 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002244 | Grad Max: 0.002244 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002746 | Grad Max: 0.294776 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051210 | Grad Max: 1.669751 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000427 | Grad Max: 0.014904 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026922 | Grad Max: 0.145342 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000565 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005685 | Grad Max: 0.011527 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000311 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001477 | Grad Max: 0.004157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001281 | Grad Max: 0.002939 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026915 | Grad Max: 0.026915 [GRADIENT NORM TOTAL] 8.4993 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.761 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51089954 0.48910046] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 691/1357 | B: 570/1286 | C: 459/1589 [LOSS Ex1] A: 0.64373 | B: 0.63981 | C: 0.63341 [LOGITS Ex2 A] Mean Abs: 1.943 | Max: 6.098 [LOSS Ex2] A: 0.13733 | B: 0.36874 | C: 0.25994 ** [JOINT LOSS] ** : 0.894320 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004923 | Grad Max: 0.152986 -> Layer: shared_layers.0.bias | Grad Mean: 0.499393 | Grad Max: 2.193295 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.006473 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.007127 | Grad Max: 0.007127 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003031 | Grad Max: 0.337489 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056944 | Grad Max: 1.896135 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000464 | Grad Max: 0.019524 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029505 | Grad Max: 0.189991 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000618 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006177 | Grad Max: 0.012400 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000288 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001642 | Grad Max: 0.003990 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001394 | Grad Max: 0.002740 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030995 | Grad Max: 0.030995 [GRADIENT NORM TOTAL] 9.9740 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.785 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032697 0.49673027] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.063 [MASKS] A(Pass/Fail): 685/1363 | B: 610/1438 | C: 483/1565 [LOSS Ex1] A: 0.64275 | B: 0.63891 | C: 0.63020 [LOGITS Ex2 A] Mean Abs: 1.942 | Max: 6.604 [LOSS Ex2] A: 0.15510 | B: 0.36935 | C: 0.26332 ** [JOINT LOSS] ** : 0.899880 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003649 | Grad Max: 0.107519 -> Layer: shared_layers.0.bias | Grad Mean: 0.239502 | Grad Max: 1.356242 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002308 | Grad Max: 0.006751 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002661 | Grad Max: 0.002661 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001541 | Grad Max: 0.235423 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027037 | Grad Max: 1.313451 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.007971 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011866 | Grad Max: 0.077784 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | 
Grad Max: 0.000302 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002380 | Grad Max: 0.005563 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000640 | Grad Max: 0.001754 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000569 | Grad Max: 0.001493 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012713 | Grad Max: 0.012713 [GRADIENT NORM TOTAL] 4.9835 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.649 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027327 0.49726734] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.549 | Std: 0.062 [MASKS] A(Pass/Fail): 664/1384 | B: 607/1441 | C: 468/1580 [LOSS Ex1] A: 0.65057 | B: 0.63969 | C: 0.63189 [LOGITS Ex2 A] Mean Abs: 1.978 | Max: 6.172 [LOSS Ex2] A: 0.13594 | B: 0.36308 | C: 0.26909 ** [JOINT LOSS] ** : 0.896754 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006127 | Grad Max: 0.169527 -> Layer: shared_layers.0.bias | Grad Mean: 0.450802 | Grad Max: 1.966999 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.006137 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009129 | Grad Max: 0.009129 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002846 | Grad Max: 0.410159 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052476 | Grad Max: 2.290655 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.015529 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025615 | Grad Max: 0.159686 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005497 | Grad Max: 0.011242 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001462 | Grad Max: 0.003346 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001210 | Grad Max: 0.002700 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027380 | Grad Max: 
0.027380 [GRADIENT NORM TOTAL] 9.2608 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.577 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5419415 0.45805842] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 664/1384 | B: 609/1439 | C: 447/1601 [LOSS Ex1] A: 0.65050 | B: 0.63551 | C: 0.63623 [LOGITS Ex2 A] Mean Abs: 1.975 | Max: 6.003 [LOSS Ex2] A: 0.14683 | B: 0.35251 | C: 0.29063 ** [JOINT LOSS] ** : 0.904067 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007444 | Grad Max: 0.198781 -> Layer: shared_layers.0.bias | Grad Mean: 0.611085 | Grad Max: 2.506172 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.006310 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008645 | Grad Max: 0.008645 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003930 | Grad Max: 0.495496 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073148 | Grad Max: 2.766162 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000581 | Grad Max: 0.021963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036868 | Grad Max: 0.208712 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000762 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007879 | Grad Max: 0.016302 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000339 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002105 | Grad Max: 0.004654 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001848 | Grad Max: 0.003701 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040425 | Grad Max: 0.040425 [GRADIENT NORM TOTAL] 12.4431 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.731 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72392404 0.27607596] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 713/1335 | B: 570/1286 | 
C: 464/1584 [LOSS Ex1] A: 0.64504 | B: 0.63970 | C: 0.63287 [LOGITS Ex2 A] Mean Abs: 1.993 | Max: 6.597 [LOSS Ex2] A: 0.13662 | B: 0.34398 | C: 0.27026 ** [JOINT LOSS] ** : 0.889486 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005167 | Grad Max: 0.147728 -> Layer: shared_layers.0.bias | Grad Mean: 0.356101 | Grad Max: 1.352393 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.006139 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000361 | Grad Max: 0.000361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002344 | Grad Max: 0.324655 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042912 | Grad Max: 1.810015 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.013956 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020768 | Grad Max: 0.136469 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000489 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004543 | Grad Max: 0.009500 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000206 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001236 | Grad Max: 0.002712 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001157 | Grad Max: 0.002649 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024879 | Grad Max: 0.024879 [GRADIENT NORM TOTAL] 7.3188 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.813 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500969 0.49903095] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 700/1348 | B: 610/1438 | C: 465/1583 [LOSS Ex1] A: 0.65121 | B: 0.63880 | C: 0.63186 [LOGITS Ex2 A] Mean Abs: 1.957 | Max: 6.219 [LOSS Ex2] A: 0.12514 | B: 0.37711 | C: 0.25767 ** [JOINT LOSS] ** : 0.893930 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004791 | Grad Max: 0.129892 -> Layer: shared_layers.0.bias | Grad Mean: 0.347710 | Grad Max: 1.445158 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002144 | Grad Max: 0.005793 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000200 | Grad Max: 0.000200 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.336066 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039957 | Grad Max: 1.881503 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.010652 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020011 | Grad Max: 0.107299 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000469 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004328 | Grad Max: 0.009224 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001166 | Grad Max: 0.002809 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001067 | Grad Max: 0.002344 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022938 | Grad Max: 0.022938 [GRADIENT NORM TOTAL] 7.1866 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.557 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6785671 0.32143286] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 685/1363 | B: 607/1441 | C: 462/1586 [LOSS Ex1] A: 0.64747 | B: 0.63959 | C: 0.63480 [LOGITS Ex2 A] Mean Abs: 1.938 | Max: 6.372 [LOSS Ex2] A: 0.14671 | B: 0.37949 | C: 0.26614 ** [JOINT LOSS] ** : 0.904731 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007935 | Grad Max: 0.213343 -> Layer: shared_layers.0.bias | Grad Mean: 0.527946 | Grad Max: 2.221864 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.005488 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003640 | Grad Max: 0.003640 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003317 | Grad Max: 0.470481 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061513 | Grad Max: 2.570152 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000488 | Grad Max: 0.019150 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.030877 | Grad Max: 0.180755 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000608 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006674 | Grad Max: 0.013177 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000322 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001803 | Grad Max: 0.004074 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001689 | Grad Max: 0.003007 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035677 | Grad Max: 0.035677 [GRADIENT NORM TOTAL] 10.3061 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.650 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6104063 0.38959375] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 579/1037 | B: 609/1439 | C: 472/1576 [LOSS Ex1] A: 0.64584 | B: 0.63540 | C: 0.63704 [LOGITS Ex2 A] Mean Abs: 1.985 | Max: 5.608 [LOSS Ex2] A: 0.13653 | B: 0.35328 | C: 0.28652 ** [JOINT LOSS] ** : 0.898204 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005352 | Grad Max: 0.171098 -> Layer: shared_layers.0.bias | Grad Mean: 0.260640 | Grad Max: 0.992219 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.006644 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006080 | Grad Max: 0.006080 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001809 | Grad Max: 0.219955 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032820 | Grad Max: 1.221215 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.009316 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017013 | Grad Max: 0.090861 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000459 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003784 | Grad Max: 0.008426 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000173 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001047 | Grad Max: 0.002271 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001076 | 
Grad Max: 0.002201 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022025 | Grad Max: 0.022025 [GRADIENT NORM TOTAL] 5.2330 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.814 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071214 0.4928786] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 696/1352 | B: 570/1286 | C: 308/1068 [LOSS Ex1] A: 0.64638 | B: 0.63959 | C: 0.63251 [LOGITS Ex2 A] Mean Abs: 2.007 | Max: 6.790 [LOSS Ex2] A: 0.14597 | B: 0.34377 | C: 0.26327 ** [JOINT LOSS] ** : 0.890498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003567 | Grad Max: 0.179041 -> Layer: shared_layers.0.bias | Grad Mean: 0.426118 | Grad Max: 2.207880 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006141 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006079 | Grad Max: 0.006079 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002648 | Grad Max: 0.353049 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049111 | Grad Max: 1.972742 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000377 | Grad Max: 0.014589 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024243 | Grad Max: 0.149414 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000513 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005097 | Grad Max: 0.010529 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000231 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001392 | Grad Max: 0.002890 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001251 | Grad Max: 0.002905 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028111 | Grad Max: 0.028111 [GRADIENT NORM TOTAL] 9.0322 [EPOCH SUMMARY] Train Loss: 0.8948 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8921 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 111/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.764 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51088405 0.48911592] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 691/1357 | B: 610/1438 | C: 480/1568 [LOSS Ex1] A: 0.64357 | B: 0.63869 | C: 0.63190 [LOGITS Ex2 A] Mean Abs: 2.018 | Max: 5.909 [LOSS Ex2] A: 0.14783 | B: 0.38548 | C: 0.27813 ** [JOINT LOSS] ** : 0.908535 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009625 | Grad Max: 0.255507 -> Layer: shared_layers.0.bias | Grad Mean: 0.721226 | Grad Max: 3.237003 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002286 | Grad Max: 0.006226 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004182 | Grad Max: 0.004182 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004510 | Grad Max: 0.575853 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084558 | Grad Max: 3.201047 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000716 | Grad Max: 0.026963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045435 | Grad Max: 0.277940 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000970 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009641 | Grad Max: 0.020741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000419 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002599 | Grad Max: 0.005697 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002353 | Grad Max: 0.004187 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050945 | Grad Max: 0.050945 [GRADIENT NORM TOTAL] 14.3447 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.788 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5032933 0.49670678] | 
Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 685/1363 | B: 607/1441 | C: 470/1578 [LOSS Ex1] A: 0.64260 | B: 0.63948 | C: 0.63718 [LOGITS Ex2 A] Mean Abs: 1.994 | Max: 5.948 [LOSS Ex2] A: 0.16199 | B: 0.36398 | C: 0.27408 ** [JOINT LOSS] ** : 0.906436 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006729 | Grad Max: 0.204741 -> Layer: shared_layers.0.bias | Grad Mean: 0.481005 | Grad Max: 2.135135 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006433 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004505 | Grad Max: 0.004505 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003137 | Grad Max: 0.362131 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057665 | Grad Max: 2.034848 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000486 | Grad Max: 0.017587 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030989 | Grad Max: 0.177044 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000651 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006729 | Grad Max: 0.013501 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000316 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001809 | Grad Max: 0.004250 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001537 | Grad Max: 0.003103 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034140 | Grad Max: 0.034140 [GRADIENT NORM TOTAL] 9.7041 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.652 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5027103 0.49728975] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 664/1384 | B: 609/1439 | C: 481/1567 [LOSS Ex1] A: 0.65043 | B: 0.63530 | C: 0.63098 [LOGITS Ex2 A] Mean Abs: 1.926 | Max: 6.338 [LOSS Ex2] A: 0.13853 | B: 0.33685 | C: 0.25418 ** [JOINT LOSS] ** : 0.882088 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002845 | Grad Max: 
0.080772 -> Layer: shared_layers.0.bias | Grad Mean: 0.239408 | Grad Max: 1.039432 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.006943 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012597 | Grad Max: 0.012597 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001533 | Grad Max: 0.462363 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028600 | Grad Max: 2.575244 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000220 | Grad Max: 0.008477 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013924 | Grad Max: 0.079916 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000365 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002913 | Grad Max: 0.007095 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000171 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000773 | Grad Max: 0.002119 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000705 | Grad Max: 0.001999 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014804 | Grad Max: 0.014804 [GRADIENT NORM TOTAL] 5.6078 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.579 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5420046 0.45799538] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 664/1384 | B: 570/1286 | C: 442/1606 [LOSS Ex1] A: 0.65037 | B: 0.63949 | C: 0.63476 [LOGITS Ex2 A] Mean Abs: 1.870 | Max: 6.342 [LOSS Ex2] A: 0.14408 | B: 0.36341 | C: 0.27349 ** [JOINT LOSS] ** : 0.901868 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003719 | Grad Max: 0.155521 -> Layer: shared_layers.0.bias | Grad Mean: 0.456565 | Grad Max: 2.118744 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.006287 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007732 | Grad Max: 0.007732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002818 | Grad Max: 0.326501 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052901 | Grad Max: 1.802566 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.017430 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027253 | Grad Max: 0.152804 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000513 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005741 | Grad Max: 0.011432 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001555 | Grad Max: 0.003666 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001275 | Grad Max: 0.002607 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029695 | Grad Max: 0.029695 [GRADIENT NORM TOTAL] 9.3090 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.734 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7246984 0.27530158] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 714/1334 | B: 611/1437 | C: 476/1572 [LOSS Ex1] A: 0.64489 | B: 0.63859 | C: 0.63645 [LOGITS Ex2 A] Mean Abs: 1.940 | Max: 5.894 [LOSS Ex2] A: 0.13820 | B: 0.37164 | C: 0.26059 ** [JOINT LOSS] ** : 0.896787 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003202 | Grad Max: 0.112500 -> Layer: shared_layers.0.bias | Grad Mean: 0.282837 | Grad Max: 1.451387 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.006303 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002889 | Grad Max: 0.002889 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001740 | Grad Max: 0.261353 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031532 | Grad Max: 1.455544 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.012486 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014570 | Grad Max: 0.102527 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000356 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003009 | Grad Max: 0.006588 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000141 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000824 | Grad Max: 0.001912 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000712 | Grad Max: 0.001765 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016579 | Grad Max: 0.016579 [GRADIENT NORM TOTAL] 5.8619 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.816 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50094986 0.49905017] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 700/1348 | B: 607/1441 | C: 429/1619 [LOSS Ex1] A: 0.65108 | B: 0.63938 | C: 0.63489 [LOGITS Ex2 A] Mean Abs: 1.971 | Max: 5.999 [LOSS Ex2] A: 0.13091 | B: 0.36461 | C: 0.25351 ** [JOINT LOSS] ** : 0.891464 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003958 | Grad Max: 0.135044 -> Layer: shared_layers.0.bias | Grad Mean: 0.339356 | Grad Max: 1.668382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.005665 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005666 | Grad Max: 0.005666 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.331919 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040105 | Grad Max: 1.845672 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.011279 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019853 | Grad Max: 0.112071 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000404 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004218 | Grad Max: 0.008643 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001152 | Grad Max: 0.002587 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001019 | Grad Max: 0.002718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022894 | Grad Max: 0.022894 [GRADIENT NORM TOTAL] 7.3015 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.114 | Max: 0.559 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6790944 0.3209057] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 685/1363 | B: 609/1439 | C: 477/1571 [LOSS Ex1] A: 0.64733 | B: 0.63520 | C: 0.62957 [LOGITS Ex2 A] Mean Abs: 1.984 | Max: 5.633 [LOSS Ex2] A: 0.15164 | B: 0.34770 | C: 0.28125 ** [JOINT LOSS] ** : 0.897560 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004880 | Grad Max: 0.196758 -> Layer: shared_layers.0.bias | Grad Mean: 0.467317 | Grad Max: 2.406163 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.005847 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000131 | Grad Max: 0.000131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002904 | Grad Max: 0.483871 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054227 | Grad Max: 2.705593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000420 | Grad Max: 0.015271 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026866 | Grad Max: 0.153475 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000606 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005655 | Grad Max: 0.012051 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000291 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001531 | Grad Max: 0.003694 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001423 | Grad Max: 0.002977 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030999 | Grad Max: 0.030999 [GRADIENT NORM TOTAL] 9.9943 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.653 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6106794 0.38932064] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 580/1036 | B: 570/1286 | C: 443/1605 [LOSS Ex1] A: 0.64569 | B: 0.63939 | C: 0.63524 [LOGITS Ex2 A] Mean Abs: 1.995 | Max: 5.806 [LOSS Ex2] A: 0.13203 | B: 0.33731 | C: 0.26325 ** [JOINT LOSS] ** : 0.884306 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001733 | Grad Max: 0.036179 -> Layer: shared_layers.0.bias | Grad Mean: 0.096690 | Grad Max: 0.375981 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006398 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009293 | Grad Max: 0.009293 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000691 | Grad Max: 0.153999 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012100 | Grad Max: 0.859100 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.003797 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003907 | Grad Max: 0.029519 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000207 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000825 | Grad Max: 0.003284 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.000770 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001268 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004058 | Grad Max: 0.004058 [GRADIENT NORM TOTAL] 2.4958 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.816 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071514 0.49284855] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 696/1352 | B: 612/1436 | C: 451/1597 [LOSS Ex1] A: 0.64623 | B: 0.63849 | C: 0.63573 [LOGITS Ex2 A] Mean Abs: 1.953 | Max: 7.965 [LOSS Ex2] A: 0.13166 | B: 0.37455 | C: 0.29081 ** [JOINT LOSS] ** : 0.905823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004943 | Grad Max: 0.133292 -> Layer: shared_layers.0.bias | Grad Mean: 0.380450 | Grad Max: 1.735773 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005838 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000027 | Grad Max: 0.000027 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002453 | Grad Max: 
0.388551 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045555 | Grad Max: 2.201898 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.014613 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023670 | Grad Max: 0.151248 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000484 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005075 | Grad Max: 0.010361 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001400 | Grad Max: 0.003334 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001317 | Grad Max: 0.002517 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028701 | Grad Max: 0.028701 [GRADIENT NORM TOTAL] 8.0235 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.767 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108718 0.48912823] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 691/1357 | B: 607/1441 | C: 476/1572 [LOSS Ex1] A: 0.64341 | B: 0.63927 | C: 0.63174 [LOGITS Ex2 A] Mean Abs: 1.959 | Max: 5.904 [LOSS Ex2] A: 0.13682 | B: 0.37096 | C: 0.25022 ** [JOINT LOSS] ** : 0.890808 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004694 | Grad Max: 0.128897 -> Layer: shared_layers.0.bias | Grad Mean: 0.378737 | Grad Max: 1.735713 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.006334 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001403 | Grad Max: 0.001403 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002419 | Grad Max: 0.324462 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045003 | Grad Max: 1.837222 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000373 | Grad Max: 0.012402 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023723 | Grad Max: 0.111841 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000516 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005092 | Grad Max: 0.010444 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001376 | Grad Max: 0.003200 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001203 | Grad Max: 0.002495 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026488 | Grad Max: 0.026488 [GRADIENT NORM TOTAL] 7.6823 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.791 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503335 0.49666503] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 686/1362 | B: 610/1438 | C: 487/1561 [LOSS Ex1] A: 0.64243 | B: 0.63508 | C: 0.62983 [LOGITS Ex2 A] Mean Abs: 1.948 | Max: 8.153 [LOSS Ex2] A: 0.14764 | B: 0.33922 | C: 0.25462 ** [JOINT LOSS] ** : 0.882939 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002557 | Grad Max: 0.111608 -> Layer: shared_layers.0.bias | Grad Mean: 0.068875 | Grad Max: 0.334585 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002364 | Grad Max: 0.006516 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003383 | Grad Max: 0.003383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000727 | Grad Max: 0.141815 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012092 | Grad Max: 0.790715 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.004432 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005039 | Grad Max: 0.026763 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000167 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001151 | Grad Max: 0.003682 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000318 | Grad Max: 0.001076 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001312 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006548 | Grad Max: 0.006548 [GRADIENT NORM TOTAL] 1.9522 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.655 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50270396 0.49729607] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 665/1383 | B: 571/1285 | C: 455/1593 [LOSS Ex1] A: 0.65028 | B: 0.63926 | C: 0.63460 [LOGITS Ex2 A] Mean Abs: 1.938 | Max: 6.523 [LOSS Ex2] A: 0.13160 | B: 0.33711 | C: 0.26267 ** [JOINT LOSS] ** : 0.885174 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001862 | Grad Max: 0.044508 -> Layer: shared_layers.0.bias | Grad Mean: 0.051439 | Grad Max: 0.378444 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005721 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000072 | Grad Max: 0.000072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000473 | Grad Max: 0.113891 -> Layer: exit2_layers.0.bias | Grad Mean: 0.007753 | Grad Max: 0.615574 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003144 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001652 | Grad Max: 0.016238 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000313 | Grad Max: 0.002285 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000102 | Grad Max: 0.000503 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000530 | Grad Max: 0.001092 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000108 | Grad Max: 0.000108 [GRADIENT NORM TOTAL] 1.6760 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.109 | Max: 0.582 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54206556 0.45793444] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 664/1384 | B: 613/1435 | C: 473/1575 [LOSS Ex1] A: 0.65021 | B: 0.63835 | C: 0.62861 [LOGITS Ex2 A] Mean 
Abs: 1.929 | Max: 5.617 [LOSS Ex2] A: 0.14302 | B: 0.36294 | C: 0.26471 ** [JOINT LOSS] ** : 0.895944 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002303 | Grad Max: 0.072302 -> Layer: shared_layers.0.bias | Grad Mean: 0.054322 | Grad Max: 0.236491 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006883 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010335 | Grad Max: 0.010335 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000572 | Grad Max: 0.152602 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009326 | Grad Max: 0.853370 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002580 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001896 | Grad Max: 0.014788 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000343 | Grad Max: 0.002129 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000511 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.000927 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001525 | Grad Max: 0.001525 [GRADIENT NORM TOTAL] 1.8476 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.737 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7256309 0.2743691] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.064 [MASKS] A(Pass/Fail): 716/1332 | B: 607/1441 | C: 326/1050 [LOSS Ex1] A: 0.64470 | B: 0.63911 | C: 0.63271 [LOGITS Ex2 A] Mean Abs: 1.976 | Max: 7.155 [LOSS Ex2] A: 0.13035 | B: 0.36530 | C: 0.26534 ** [JOINT LOSS] ** : 0.892504 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003200 | Grad Max: 0.087522 -> Layer: shared_layers.0.bias | Grad Mean: 0.156629 | Grad Max: 0.570005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.006494 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.009305 | Grad Max: 0.009305 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001173 | Grad Max: 0.181303 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020952 | Grad Max: 1.023808 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.008588 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009614 | Grad Max: 0.067635 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000263 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002088 | Grad Max: 0.005269 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000563 | Grad Max: 0.001377 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000484 | Grad Max: 0.001674 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010785 | Grad Max: 0.010785 [GRADIENT NORM TOTAL] 3.5957 [EPOCH SUMMARY] Train Loss: 0.8944 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8754 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8789 -> New: 0.8754) ############################## EPOCH 112/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.819 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009151 0.49908492] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 700/1348 | B: 611/1437 | C: 464/1584 [LOSS Ex1] A: 0.65089 | B: 0.63489 | C: 0.63274 [LOGITS Ex2 A] Mean Abs: 1.994 | Max: 5.873 [LOSS Ex2] A: 0.13056 | B: 0.34979 | C: 0.25660 ** [JOINT LOSS] ** : 0.885155 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002676 | Grad Max: 0.066260 -> Layer: shared_layers.0.bias | Grad Mean: 0.190054 | Grad Max: 0.937510 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.005748 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003692 | Grad Max: 0.003692 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001199 | 
Grad Max: 0.279875 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022140 | Grad Max: 1.575784 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.008840 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010490 | Grad Max: 0.078916 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002207 | Grad Max: 0.005449 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000588 | Grad Max: 0.001473 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001714 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011437 | Grad Max: 0.011437 [GRADIENT NORM TOTAL] 4.4394 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.563 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.67990094 0.32009903] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 686/1362 | B: 571/1285 | C: 454/1594 [LOSS Ex1] A: 0.64711 | B: 0.63907 | C: 0.63883 [LOGITS Ex2 A] Mean Abs: 1.983 | Max: 6.300 [LOSS Ex2] A: 0.14796 | B: 0.34606 | C: 0.28613 ** [JOINT LOSS] ** : 0.901714 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003450 | Grad Max: 0.071976 -> Layer: shared_layers.0.bias | Grad Mean: 0.207580 | Grad Max: 1.068628 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005720 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001367 | Grad Max: 0.001367 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001394 | Grad Max: 0.253972 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025775 | Grad Max: 1.437107 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000216 | Grad Max: 0.007683 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013714 | Grad Max: 0.074424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000324 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002851 | Grad Max: 
0.006456 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000149 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000754 | Grad Max: 0.001831 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000649 | Grad Max: 0.001717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013711 | Grad Max: 0.013711 [GRADIENT NORM TOTAL] 4.3821 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.657 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61107475 0.38892522] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.064 [MASKS] A(Pass/Fail): 580/1036 | B: 613/1435 | C: 466/1582 [LOSS Ex1] A: 0.64545 | B: 0.63815 | C: 0.63108 [LOGITS Ex2 A] Mean Abs: 2.038 | Max: 6.307 [LOSS Ex2] A: 0.13850 | B: 0.36246 | C: 0.25428 ** [JOINT LOSS] ** : 0.889976 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003864 | Grad Max: 0.106781 -> Layer: shared_layers.0.bias | Grad Mean: 0.295825 | Grad Max: 1.163207 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006167 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004428 | Grad Max: 0.004428 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001963 | Grad Max: 0.267031 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036155 | Grad Max: 1.493388 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000283 | Grad Max: 0.011443 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018076 | Grad Max: 0.096691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000433 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003917 | Grad Max: 0.008194 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001052 | Grad Max: 0.002664 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000937 | Grad Max: 0.002391 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020422 | Grad Max: 0.020422 [GRADIENT NORM TOTAL] 6.1395 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.821 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072226 0.4927774] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 696/1352 | B: 608/1440 | C: 485/1563 [LOSS Ex1] A: 0.64598 | B: 0.63892 | C: 0.63083 [LOGITS Ex2 A] Mean Abs: 2.035 | Max: 5.923 [LOSS Ex2] A: 0.13027 | B: 0.36549 | C: 0.27201 ** [JOINT LOSS] ** : 0.894502 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005251 | Grad Max: 0.131854 -> Layer: shared_layers.0.bias | Grad Mean: 0.353632 | Grad Max: 1.641911 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.005574 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000996 | Grad Max: 0.000997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.322319 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043547 | Grad Max: 1.787200 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000341 | Grad Max: 0.011886 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021614 | Grad Max: 0.130834 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000434 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004652 | Grad Max: 0.009499 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000229 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.003071 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001080 | Grad Max: 0.002379 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023258 | Grad Max: 0.023258 [GRADIENT NORM TOTAL] 7.5684 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.773 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51079583 0.48920414] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 691/1357 | B: 611/1437 | C: 491/1557 [LOSS Ex1] A: 0.64315 | B: 0.63470 | C: 0.63317 [LOGITS Ex2 A] Mean Abs: 
2.001 | Max: 5.650 [LOSS Ex2] A: 0.13753 | B: 0.34404 | C: 0.26349 ** [JOINT LOSS] ** : 0.885359 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001715 | Grad Max: 0.032374 -> Layer: shared_layers.0.bias | Grad Mean: 0.065595 | Grad Max: 0.247916 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.006705 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005480 | Grad Max: 0.005480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000630 | Grad Max: 0.153016 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010936 | Grad Max: 0.863790 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.004048 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003050 | Grad Max: 0.031550 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000189 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000620 | Grad Max: 0.002660 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.000656 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000333 | Grad Max: 0.001035 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003475 | Grad Max: 0.003475 [GRADIENT NORM TOTAL] 1.9719 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.797 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5034577 0.49654227] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 688/1360 | B: 571/1285 | C: 482/1566 [LOSS Ex1] A: 0.64215 | B: 0.63888 | C: 0.63165 [LOGITS Ex2 A] Mean Abs: 1.997 | Max: 7.277 [LOSS Ex2] A: 0.15519 | B: 0.34744 | C: 0.24236 ** [JOINT LOSS] ** : 0.885891 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003234 | Grad Max: 0.105935 -> Layer: shared_layers.0.bias | Grad Mean: 0.203712 | Grad Max: 0.975250 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006200 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001496 | Grad Max: 0.001496 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001321 | Grad Max: 0.144062 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024017 | Grad Max: 0.742641 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.009315 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011967 | Grad Max: 0.078282 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000303 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002383 | Grad Max: 0.005879 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000627 | Grad Max: 0.001618 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001441 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011711 | Grad Max: 0.011711 [GRADIENT NORM TOTAL] 4.0917 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.660 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50271994 0.49728003] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 667/1381 | B: 614/1434 | C: 456/1592 [LOSS Ex1] A: 0.65001 | B: 0.63795 | C: 0.63203 [LOGITS Ex2 A] Mean Abs: 1.980 | Max: 6.936 [LOSS Ex2] A: 0.13571 | B: 0.36814 | C: 0.27387 ** [JOINT LOSS] ** : 0.899238 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.053875 -> Layer: shared_layers.0.bias | Grad Mean: 0.114385 | Grad Max: 0.594906 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005993 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006627 | Grad Max: 0.006627 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000738 | Grad Max: 0.407275 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012807 | Grad Max: 2.284618 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003758 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002323 | Grad Max: 0.021632 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | 
Grad Max: 0.000166 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000402 | Grad Max: 0.002437 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000096 | Grad Max: 0.000557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000239 | Grad Max: 0.000725 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000378 | Grad Max: 0.000378 [GRADIENT NORM TOTAL] 3.7333 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.587 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5421711 0.45782894] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.549 | Std: 0.061 [MASKS] A(Pass/Fail): 664/1384 | B: 608/1440 | C: 435/1613 [LOSS Ex1] A: 0.64995 | B: 0.63871 | C: 0.63554 [LOGITS Ex2 A] Mean Abs: 1.990 | Max: 6.168 [LOSS Ex2] A: 0.14144 | B: 0.36418 | C: 0.25021 ** [JOINT LOSS] ** : 0.893343 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002472 | Grad Max: 0.089223 -> Layer: shared_layers.0.bias | Grad Mean: 0.239104 | Grad Max: 1.088072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.007028 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013601 | Grad Max: 0.013601 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001575 | Grad Max: 0.248893 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029019 | Grad Max: 1.374720 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.009452 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013204 | Grad Max: 0.093084 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000283 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002696 | Grad Max: 0.006036 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000152 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000709 | Grad Max: 0.001711 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000609 | Grad Max: 0.001701 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013418 | Grad Max: 
0.013418 [GRADIENT NORM TOTAL] 5.2861 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.741 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7270694 0.27293068] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.065 [MASKS] A(Pass/Fail): 716/1332 | B: 611/1437 | C: 457/1591 [LOSS Ex1] A: 0.64443 | B: 0.63448 | C: 0.62980 [LOGITS Ex2 A] Mean Abs: 2.023 | Max: 6.096 [LOSS Ex2] A: 0.13692 | B: 0.34307 | C: 0.25782 ** [JOINT LOSS] ** : 0.882177 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.078947 -> Layer: shared_layers.0.bias | Grad Mean: 0.133900 | Grad Max: 0.780460 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002344 | Grad Max: 0.006276 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006137 | Grad Max: 0.006137 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000929 | Grad Max: 0.201871 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016759 | Grad Max: 1.122662 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.005642 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007131 | Grad Max: 0.047612 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000225 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001475 | Grad Max: 0.003994 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000396 | Grad Max: 0.001101 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000431 | Grad Max: 0.001378 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008272 | Grad Max: 0.008272 [GRADIENT NORM TOTAL] 3.3879 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.824 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009236 0.49907643] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 701/1347 | B: 571/1285 | C: 
480/1568 [LOSS Ex1] A: 0.65064 | B: 0.63866 | C: 0.63252 [LOGITS Ex2 A] Mean Abs: 2.013 | Max: 6.156 [LOSS Ex2] A: 0.13074 | B: 0.36159 | C: 0.27079 ** [JOINT LOSS] ** : 0.894979 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005684 | Grad Max: 0.177024 -> Layer: shared_layers.0.bias | Grad Mean: 0.388773 | Grad Max: 1.814867 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005789 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001777 | Grad Max: 0.001777 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002536 | Grad Max: 0.329032 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046437 | Grad Max: 1.835523 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000378 | Grad Max: 0.010614 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024109 | Grad Max: 0.103904 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000541 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005156 | Grad Max: 0.011330 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000277 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001371 | Grad Max: 0.003118 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001224 | Grad Max: 0.002398 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026363 | Grad Max: 0.026363 [GRADIENT NORM TOTAL] 7.9500 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.567 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6809912 0.3190088] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 688/1360 | B: 615/1433 | C: 486/1562 [LOSS Ex1] A: 0.64685 | B: 0.63775 | C: 0.62877 [LOGITS Ex2 A] Mean Abs: 1.996 | Max: 6.321 [LOSS Ex2] A: 0.13863 | B: 0.37883 | C: 0.26521 ** [JOINT LOSS] ** : 0.898682 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006835 | Grad Max: 0.165496 -> Layer: shared_layers.0.bias | Grad Mean: 0.480119 | Grad Max: 2.233066 -> Layer: exit1_layers.0.weight | Grad 
Mean: 0.002198 | Grad Max: 0.006170 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003412 | Grad Max: 0.003412 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003239 | Grad Max: 0.420965 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060280 | Grad Max: 2.383028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.017146 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030583 | Grad Max: 0.169512 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000618 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006539 | Grad Max: 0.013409 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000324 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001742 | Grad Max: 0.003966 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001636 | Grad Max: 0.003128 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033659 | Grad Max: 0.033659 [GRADIENT NORM TOTAL] 10.0143 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.662 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6117492 0.3882508] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 580/1036 | B: 608/1440 | C: 467/1581 [LOSS Ex1] A: 0.64519 | B: 0.63852 | C: 0.63333 [LOGITS Ex2 A] Mean Abs: 2.072 | Max: 5.854 [LOSS Ex2] A: 0.13299 | B: 0.36550 | C: 0.24724 ** [JOINT LOSS] ** : 0.887586 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001739 | Grad Max: 0.038771 -> Layer: shared_layers.0.bias | Grad Mean: 0.067468 | Grad Max: 0.246020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006164 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006282 | Grad Max: 0.006282 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000601 | Grad Max: 0.176060 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010358 | Grad Max: 0.983622 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003067 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002130 
| Grad Max: 0.026329 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000129 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000412 | Grad Max: 0.002555 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000751 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000314 | Grad Max: 0.001325 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003497 | Grad Max: 0.003497 [GRADIENT NORM TOTAL] 2.3230 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.826 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072111 0.49278888] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 696/1352 | B: 611/1437 | C: 438/1610 [LOSS Ex1] A: 0.64573 | B: 0.63428 | C: 0.63619 [LOGITS Ex2 A] Mean Abs: 2.078 | Max: 6.655 [LOSS Ex2] A: 0.13376 | B: 0.35517 | C: 0.28008 ** [JOINT LOSS] ** : 0.895069 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006780 | Grad Max: 0.190956 -> Layer: shared_layers.0.bias | Grad Mean: 0.515983 | Grad Max: 2.441994 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.006105 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003222 | Grad Max: 0.003222 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003459 | Grad Max: 0.422291 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064108 | Grad Max: 2.342909 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.017506 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031910 | Grad Max: 0.166072 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000703 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006796 | Grad Max: 0.014496 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000342 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001797 | Grad Max: 0.004556 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001566 | Grad Max: 
0.003043 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033286 | Grad Max: 0.033286 [GRADIENT NORM TOTAL] 10.8763 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.778 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51085913 0.4891409 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 692/1356 | B: 572/1284 | C: 319/1057 [LOSS Ex1] A: 0.64288 | B: 0.63848 | C: 0.62795 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 5.757 [LOSS Ex2] A: 0.14505 | B: 0.34675 | C: 0.28352 ** [JOINT LOSS] ** : 0.894876 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008183 | Grad Max: 0.198739 -> Layer: shared_layers.0.bias | Grad Mean: 0.535444 | Grad Max: 2.446313 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.006253 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002604 | Grad Max: 0.002604 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003584 | Grad Max: 0.428690 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066546 | Grad Max: 2.426659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000521 | Grad Max: 0.019291 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033356 | Grad Max: 0.187273 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000714 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007109 | Grad Max: 0.014969 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000340 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001890 | Grad Max: 0.004356 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001717 | Grad Max: 0.003316 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036102 | Grad Max: 0.036102 [GRADIENT NORM TOTAL] 11.0627 [EPOCH SUMMARY] Train Loss: 0.8920 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8710 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8754 -> New: 0.8710) ############################## EPOCH 113/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.803 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50348747 0.4965125 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 690/1358 | B: 615/1433 | C: 502/1546 [LOSS Ex1] A: 0.64189 | B: 0.63757 | C: 0.63090 [LOGITS Ex2 A] Mean Abs: 2.027 | Max: 7.531 [LOSS Ex2] A: 0.15358 | B: 0.36731 | C: 0.25535 ** [JOINT LOSS] ** : 0.895535 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004572 | Grad Max: 0.155407 -> Layer: shared_layers.0.bias | Grad Mean: 0.100301 | Grad Max: 0.396994 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002267 | Grad Max: 0.006556 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000647 | Grad Max: 0.000647 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000913 | Grad Max: 0.189057 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015164 | Grad Max: 0.989198 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004667 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006382 | Grad Max: 0.033952 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000263 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001505 | Grad Max: 0.004309 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000394 | Grad Max: 0.001401 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000383 | Grad Max: 0.001245 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007181 | Grad Max: 0.007181 [GRADIENT NORM TOTAL] 2.5892 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.664 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026603 0.4973397] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.550 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 608/1440 | C: 482/1566 [LOSS Ex1] A: 0.64978 | B: 0.63834 | C: 0.62979 [LOGITS Ex2 A] Mean Abs: 1.940 | Max: 6.582 [LOSS Ex2] A: 0.13784 | B: 0.38869 | C: 0.25689 ** [JOINT LOSS] ** : 0.900442 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008436 | Grad Max: 0.227072 -> Layer: shared_layers.0.bias | Grad Mean: 0.696499 | Grad Max: 3.126529 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005735 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004758 | Grad Max: 0.004758 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004410 | Grad Max: 0.679991 -> Layer: exit2_layers.0.bias | Grad Mean: 0.082163 | Grad Max: 3.798304 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.021641 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040619 | Grad Max: 0.221516 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000777 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008662 | Grad Max: 0.017507 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000423 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002310 | Grad Max: 0.005449 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002088 | Grad Max: 0.003639 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044401 | Grad Max: 0.044401 [GRADIENT NORM TOTAL] 14.5566 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.110 | Max: 0.591 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5423879 0.4576121] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 665/1383 | B: 611/1437 | C: 469/1579 [LOSS Ex1] A: 0.64973 | B: 0.63410 | C: 0.63339 [LOGITS Ex2 A] Mean Abs: 1.913 | Max: 5.914 [LOSS Ex2] A: 0.15774 | B: 0.38131 | C: 0.27995 ** [JOINT LOSS] ** : 0.912072 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014229 | Grad Max: 0.398510 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.880956 | Grad Max: 3.547540 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006079 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008153 | Grad Max: 0.008153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005706 | Grad Max: 0.734843 -> Layer: exit2_layers.0.bias | Grad Mean: 0.105726 | Grad Max: 4.081692 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000841 | Grad Max: 0.027193 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053681 | Grad Max: 0.272178 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000108 | Grad Max: 0.001112 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011534 | Grad Max: 0.022775 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000552 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003066 | Grad Max: 0.007079 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002812 | Grad Max: 0.005303 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058902 | Grad Max: 0.058902 [GRADIENT NORM TOTAL] 17.7423 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.746 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72842014 0.27157986] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.065 [MASKS] A(Pass/Fail): 719/1329 | B: 572/1284 | C: 468/1580 [LOSS Ex1] A: 0.64418 | B: 0.63830 | C: 0.63218 [LOGITS Ex2 A] Mean Abs: 1.977 | Max: 6.580 [LOSS Ex2] A: 0.13770 | B: 0.36371 | C: 0.25562 ** [JOINT LOSS] ** : 0.890565 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010567 | Grad Max: 0.324502 -> Layer: shared_layers.0.bias | Grad Mean: 0.632645 | Grad Max: 2.534528 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002287 | Grad Max: 0.006362 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007824 | Grad Max: 0.007824 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004068 | Grad Max: 0.563412 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074682 | Grad Max: 3.142462 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000596 | Grad Max: 0.018198 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037884 | Grad Max: 0.187550 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000814 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008213 | Grad Max: 0.016507 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000417 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002164 | Grad Max: 0.005097 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001916 | Grad Max: 0.003397 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040542 | Grad Max: 0.040542 [GRADIENT NORM TOTAL] 12.6861 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.829 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50093406 0.49906594] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 702/1346 | B: 615/1433 | C: 442/1606 [LOSS Ex1] A: 0.65043 | B: 0.63741 | C: 0.63290 [LOGITS Ex2 A] Mean Abs: 2.022 | Max: 5.851 [LOSS Ex2] A: 0.13832 | B: 0.36000 | C: 0.26355 ** [JOINT LOSS] ** : 0.894203 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005077 | Grad Max: 0.214376 -> Layer: shared_layers.0.bias | Grad Mean: 0.085152 | Grad Max: 0.301566 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002093 | Grad Max: 0.005797 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001161 | Grad Max: 0.001161 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000878 | Grad Max: 0.154376 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012946 | Grad Max: 0.879697 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003350 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002527 | Grad Max: 0.023689 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000422 | Grad Max: 0.002151 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000115 | 
Grad Max: 0.000829 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000271 | Grad Max: 0.001060 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002321 | Grad Max: 0.002321 [GRADIENT NORM TOTAL] 2.4129 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.570 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.68188864 0.3181113 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 690/1358 | B: 608/1440 | C: 453/1595 [LOSS Ex1] A: 0.64662 | B: 0.63817 | C: 0.63642 [LOGITS Ex2 A] Mean Abs: 2.037 | Max: 5.938 [LOSS Ex2] A: 0.15485 | B: 0.37746 | C: 0.29488 ** [JOINT LOSS] ** : 0.916136 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005568 | Grad Max: 0.237527 -> Layer: shared_layers.0.bias | Grad Mean: 0.622633 | Grad Max: 3.022476 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005626 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006132 | Grad Max: 0.006132 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003972 | Grad Max: 0.598495 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074318 | Grad Max: 3.335666 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000569 | Grad Max: 0.021732 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037139 | Grad Max: 0.199512 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000786 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007727 | Grad Max: 0.015735 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000380 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002031 | Grad Max: 0.004809 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001711 | Grad Max: 0.003129 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037697 | Grad Max: 0.037697 [GRADIENT NORM TOTAL] 13.4832 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.667 [SAMPLE 0 
PREDICTION A] Top2 Probs: [0.61221707 0.3877829 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.065 [MASKS] A(Pass/Fail): 581/1035 | B: 611/1437 | C: 447/1601 [LOSS Ex1] A: 0.64496 | B: 0.63394 | C: 0.63197 [LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.593 [LOSS Ex2] A: 0.14701 | B: 0.35836 | C: 0.27406 ** [JOINT LOSS] ** : 0.896768 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009897 | Grad Max: 0.299263 -> Layer: shared_layers.0.bias | Grad Mean: 0.758540 | Grad Max: 3.683718 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006310 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001684 | Grad Max: 0.001684 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004872 | Grad Max: 0.605861 -> Layer: exit2_layers.0.bias | Grad Mean: 0.091067 | Grad Max: 3.386441 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000714 | Grad Max: 0.026437 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046253 | Grad Max: 0.260030 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000911 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009802 | Grad Max: 0.018922 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000455 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002615 | Grad Max: 0.005867 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002276 | Grad Max: 0.004143 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049546 | Grad Max: 0.049546 [GRADIENT NORM TOTAL] 15.5461 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.831 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072426 0.49275738] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 696/1352 | B: 572/1284 | C: 497/1551 [LOSS Ex1] A: 0.64550 | B: 0.63815 | C: 0.63035 [LOGITS Ex2 A] Mean Abs: 2.055 | Max: 7.321 [LOSS Ex2] A: 0.14457 | B: 0.33981 | C: 0.26385 ** [JOINT LOSS] ** : 0.887411 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.009890 | Grad Max: 0.321778 -> Layer: shared_layers.0.bias | Grad Mean: 0.439954 | Grad Max: 1.897734 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006134 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005079 | Grad Max: 0.005079 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003134 | Grad Max: 0.433963 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056906 | Grad Max: 2.419709 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000442 | Grad Max: 0.014123 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027955 | Grad Max: 0.140769 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000615 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006173 | Grad Max: 0.012685 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000289 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001664 | Grad Max: 0.003701 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001524 | Grad Max: 0.003333 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032415 | Grad Max: 0.032415 [GRADIENT NORM TOTAL] 9.4027 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.783 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108184 0.4891816] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 692/1356 | B: 615/1433 | C: 450/1598 [LOSS Ex1] A: 0.64265 | B: 0.63726 | C: 0.63252 [LOGITS Ex2 A] Mean Abs: 1.979 | Max: 5.955 [LOSS Ex2] A: 0.14553 | B: 0.37861 | C: 0.27177 ** [JOINT LOSS] ** : 0.902781 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004789 | Grad Max: 0.145140 -> Layer: shared_layers.0.bias | Grad Mean: 0.358868 | Grad Max: 1.780424 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006322 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001109 | Grad Max: 0.001109 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002299 | Grad Max: 0.276873 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.041123 | Grad Max: 1.547852 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000327 | Grad Max: 0.013432 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021051 | Grad Max: 0.134929 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000427 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004195 | Grad Max: 0.008795 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001109 | Grad Max: 0.002538 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000868 | Grad Max: 0.001871 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020812 | Grad Max: 0.020812 [GRADIENT NORM TOTAL] 7.3293 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.807 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50357676 0.4964232 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 690/1358 | B: 608/1440 | C: 491/1557 [LOSS Ex1] A: 0.64166 | B: 0.63803 | C: 0.62646 [LOGITS Ex2 A] Mean Abs: 1.939 | Max: 6.349 [LOSS Ex2] A: 0.14867 | B: 0.38413 | C: 0.26003 ** [JOINT LOSS] ** : 0.899663 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006551 | Grad Max: 0.214993 -> Layer: shared_layers.0.bias | Grad Mean: 0.570267 | Grad Max: 2.828212 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006755 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004908 | Grad Max: 0.004908 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003621 | Grad Max: 0.425587 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065756 | Grad Max: 2.309348 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000538 | Grad Max: 0.020422 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034806 | Grad Max: 0.199280 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000750 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006997 | Grad Max: 0.014169 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000318 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001858 | Grad Max: 0.004365 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001531 | Grad Max: 0.002845 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035202 | Grad Max: 0.035202 [GRADIENT NORM TOTAL] 11.6467 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.668 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026857 0.4973143] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.550 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 611/1437 | C: 470/1578 [LOSS Ex1] A: 0.64958 | B: 0.63380 | C: 0.63192 [LOGITS Ex2 A] Mean Abs: 1.919 | Max: 6.080 [LOSS Ex2] A: 0.13885 | B: 0.34897 | C: 0.23909 ** [JOINT LOSS] ** : 0.880740 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.103940 -> Layer: shared_layers.0.bias | Grad Mean: 0.302691 | Grad Max: 1.559144 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.005991 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004328 | Grad Max: 0.004328 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001983 | Grad Max: 0.346079 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036305 | Grad Max: 1.909710 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.013076 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019025 | Grad Max: 0.116565 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000421 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003837 | Grad Max: 0.009086 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000205 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000986 | Grad Max: 0.002707 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000788 | Grad Max: 0.002286 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017306 | Grad Max: 0.017306 [GRADIENT NORM TOTAL] 6.6725 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.594 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54235786 0.45764217] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 665/1383 | B: 572/1284 | C: 453/1595 [LOSS Ex1] A: 0.64954 | B: 0.63802 | C: 0.63210 [LOGITS Ex2 A] Mean Abs: 1.949 | Max: 6.617 [LOSS Ex2] A: 0.14516 | B: 0.34171 | C: 0.26983 ** [JOINT LOSS] ** : 0.892118 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007279 | Grad Max: 0.246013 -> Layer: shared_layers.0.bias | Grad Mean: 0.271856 | Grad Max: 0.807233 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005665 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003825 | Grad Max: 0.003825 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001922 | Grad Max: 0.239517 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033549 | Grad Max: 1.265776 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000260 | Grad Max: 0.008187 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016345 | Grad Max: 0.076572 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000439 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003753 | Grad Max: 0.008153 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000175 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002199 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000966 | Grad Max: 0.002351 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020530 | Grad Max: 0.020530 [GRADIENT NORM TOTAL] 5.4698 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.750 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7293593 0.2706407] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.065 [MASKS] A(Pass/Fail): 719/1329 | B: 615/1433 | C: 463/1585 [LOSS Ex1] A: 0.64399 | B: 0.63712 | C: 0.63292 [LOGITS Ex2 A] Mean Abs: 1.998 | Max: 6.421 
[LOSS Ex2] A: 0.13632 | B: 0.36559 | C: 0.27684 ** [JOINT LOSS] ** : 0.897595 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008374 | Grad Max: 0.252368 -> Layer: shared_layers.0.bias | Grad Mean: 0.467190 | Grad Max: 2.051966 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006408 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006166 | Grad Max: 0.006166 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003165 | Grad Max: 0.400174 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057902 | Grad Max: 2.235495 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000447 | Grad Max: 0.013809 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028711 | Grad Max: 0.143694 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000603 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006235 | Grad Max: 0.012460 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.003744 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.003073 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031024 | Grad Max: 0.031024 [GRADIENT NORM TOTAL] 9.7211 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.833 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008782 0.49912176] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.064 [MASKS] A(Pass/Fail): 702/1346 | B: 608/1440 | C: 304/1072 [LOSS Ex1] A: 0.65025 | B: 0.63790 | C: 0.63424 [LOGITS Ex2 A] Mean Abs: 1.990 | Max: 5.953 [LOSS Ex2] A: 0.12873 | B: 0.36407 | C: 0.26680 ** [JOINT LOSS] ** : 0.893999 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002768 | Grad Max: 0.067927 -> Layer: shared_layers.0.bias | Grad Mean: 0.103829 | Grad Max: 0.664709 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.005329 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001092 | Grad Max: 
0.001092 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000781 | Grad Max: 0.185141 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013160 | Grad Max: 1.033513 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.002929 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002610 | Grad Max: 0.020000 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000386 | Grad Max: 0.002829 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000085 | Grad Max: 0.000562 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001091 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000617 | Grad Max: 0.000617 [GRADIENT NORM TOTAL] 2.6177 [EPOCH SUMMARY] Train Loss: 0.8971 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8756 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 114/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.573 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6825978 0.31740215] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 691/1357 | B: 611/1437 | C: 457/1591 [LOSS Ex1] A: 0.64642 | B: 0.63367 | C: 0.63519 [LOGITS Ex2 A] Mean Abs: 1.940 | Max: 5.853 [LOSS Ex2] A: 0.14793 | B: 0.35119 | C: 0.27447 ** [JOINT LOSS] ** : 0.896294 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008655 | Grad Max: 0.251226 -> Layer: shared_layers.0.bias | Grad Mean: 0.379058 | Grad Max: 1.588648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005948 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001507 | Grad Max: 0.001507 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002513 | Grad Max: 0.320286 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.045768 | Grad Max: 1.779865 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.010454 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022539 | Grad Max: 0.110005 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000502 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005011 | Grad Max: 0.010297 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000270 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003287 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001285 | Grad Max: 0.002612 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026945 | Grad Max: 0.026945 [GRADIENT NORM TOTAL] 7.3848 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.670 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6126545 0.38734546] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.065 [MASKS] A(Pass/Fail): 582/1034 | B: 571/1285 | C: 413/1635 [LOSS Ex1] A: 0.64476 | B: 0.63789 | C: 0.63649 [LOGITS Ex2 A] Mean Abs: 2.002 | Max: 7.265 [LOSS Ex2] A: 0.14059 | B: 0.33931 | C: 0.26484 ** [JOINT LOSS] ** : 0.887961 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006038 | Grad Max: 0.172470 -> Layer: shared_layers.0.bias | Grad Mean: 0.284050 | Grad Max: 1.180224 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005739 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006720 | Grad Max: 0.006720 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001964 | Grad Max: 0.259527 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035655 | Grad Max: 1.451970 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.010425 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017957 | Grad Max: 0.102355 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000508 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003918 | Grad Max: 0.008982 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | 
Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001056 | Grad Max: 0.002392 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001031 | Grad Max: 0.002272 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021315 | Grad Max: 0.021315 [GRADIENT NORM TOTAL] 5.7547 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.835 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50726825 0.49273178] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 696/1352 | B: 615/1433 | C: 477/1571 [LOSS Ex1] A: 0.64531 | B: 0.63699 | C: 0.63047 [LOGITS Ex2 A] Mean Abs: 2.023 | Max: 7.386 [LOSS Ex2] A: 0.12978 | B: 0.36120 | C: 0.24533 ** [JOINT LOSS] ** : 0.883025 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003307 | Grad Max: 0.161938 -> Layer: shared_layers.0.bias | Grad Mean: 0.247138 | Grad Max: 1.166225 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006060 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001360 | Grad Max: 0.001360 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001573 | Grad Max: 0.223916 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028143 | Grad Max: 1.238459 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.008642 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013714 | Grad Max: 0.085137 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002789 | Grad Max: 0.006193 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000754 | Grad Max: 0.001807 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000581 | Grad Max: 0.001899 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014070 | Grad Max: 0.014070 [GRADIENT NORM TOTAL] 5.0449 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.787 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107768 0.4892232] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 692/1356 | B: 608/1440 | C: 489/1559 [LOSS Ex1] A: 0.64245 | B: 0.63777 | C: 0.62623 [LOGITS Ex2 A] Mean Abs: 2.024 | Max: 6.085 [LOSS Ex2] A: 0.13435 | B: 0.35782 | C: 0.28383 ** [JOINT LOSS] ** : 0.894147 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004781 | Grad Max: 0.133480 -> Layer: shared_layers.0.bias | Grad Mean: 0.378912 | Grad Max: 1.497638 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.006681 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003655 | Grad Max: 0.003655 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.418860 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043881 | Grad Max: 2.335446 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000345 | Grad Max: 0.011071 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022407 | Grad Max: 0.111350 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000521 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004792 | Grad Max: 0.010877 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000230 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001295 | Grad Max: 0.003107 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001122 | Grad Max: 0.002440 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024895 | Grad Max: 0.024895 [GRADIENT NORM TOTAL] 7.6516 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.811 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50367737 0.49632263] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 691/1357 | B: 611/1437 | C: 463/1585 [LOSS Ex1] A: 0.64147 | B: 0.63354 | C: 0.63022 [LOGITS Ex2 A] Mean Abs: 1.985 | Max: 7.819 [LOSS Ex2] A: 0.14757 | B: 0.33440 | C: 0.26849 
** [JOINT LOSS] ** : 0.885229 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002501 | Grad Max: 0.084452 -> Layer: shared_layers.0.bias | Grad Mean: 0.055461 | Grad Max: 0.244953 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.006418 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000127 | Grad Max: 0.000127 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000599 | Grad Max: 0.124550 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009602 | Grad Max: 0.704680 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003098 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001918 | Grad Max: 0.016809 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000161 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000372 | Grad Max: 0.002084 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000604 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000358 | Grad Max: 0.000976 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002036 | Grad Max: 0.002036 [GRADIENT NORM TOTAL] 1.9013 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.672 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026987 0.4973013] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 572/1284 | C: 473/1575 [LOSS Ex1] A: 0.64939 | B: 0.63775 | C: 0.63069 [LOGITS Ex2 A] Mean Abs: 1.956 | Max: 6.823 [LOSS Ex2] A: 0.13092 | B: 0.34824 | C: 0.24815 ** [JOINT LOSS] ** : 0.881712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005183 | Grad Max: 0.148027 -> Layer: shared_layers.0.bias | Grad Mean: 0.467768 | Grad Max: 2.076950 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005816 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006372 | Grad Max: 0.006372 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.003043 | Grad Max: 0.397593 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056806 | Grad Max: 2.237488 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000433 | Grad Max: 0.015979 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028327 | Grad Max: 0.171141 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000616 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006005 | Grad Max: 0.012262 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001579 | Grad Max: 0.003928 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001362 | Grad Max: 0.002883 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029185 | Grad Max: 0.029185 [GRADIENT NORM TOTAL] 9.8323 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.111 | Max: 0.598 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54243743 0.4575626 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.062 [MASKS] A(Pass/Fail): 666/1382 | B: 615/1433 | C: 456/1592 [LOSS Ex1] A: 0.64936 | B: 0.63684 | C: 0.63258 [LOGITS Ex2 A] Mean Abs: 1.936 | Max: 6.819 [LOSS Ex2] A: 0.14241 | B: 0.37504 | C: 0.27552 ** [JOINT LOSS] ** : 0.903919 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004304 | Grad Max: 0.133842 -> Layer: shared_layers.0.bias | Grad Mean: 0.386256 | Grad Max: 1.882718 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.005509 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004587 | Grad Max: 0.004587 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002404 | Grad Max: 0.366781 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044259 | Grad Max: 2.057684 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.011190 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020356 | Grad Max: 0.111854 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000416 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004356 
| Grad Max: 0.009058 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001172 | Grad Max: 0.002795 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001059 | Grad Max: 0.002035 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023147 | Grad Max: 0.023147 [GRADIENT NORM TOTAL] 8.2988 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.754 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7304071 0.26959288] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 719/1329 | B: 608/1440 | C: 491/1557 [LOSS Ex1] A: 0.64379 | B: 0.63761 | C: 0.63059 [LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.763 [LOSS Ex2] A: 0.12949 | B: 0.35836 | C: 0.27006 ** [JOINT LOSS] ** : 0.889966 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003635 | Grad Max: 0.102892 -> Layer: shared_layers.0.bias | Grad Mean: 0.171771 | Grad Max: 0.667385 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006328 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002225 | Grad Max: 0.002225 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001259 | Grad Max: 0.182739 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022574 | Grad Max: 1.023749 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.007757 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011087 | Grad Max: 0.073465 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000294 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002419 | Grad Max: 0.006118 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000654 | Grad Max: 0.001517 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000519 | Grad Max: 0.001666 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011816 | Grad Max: 0.011816 [GRADIENT NORM TOTAL] 3.6973 >>> [TRAIN] BATCH 8 START <<< [DATA 
A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.838 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008451 0.49915493] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 703/1345 | B: 611/1437 | C: 461/1587 [LOSS Ex1] A: 0.65006 | B: 0.63336 | C: 0.63172 [LOGITS Ex2 A] Mean Abs: 2.030 | Max: 6.010 [LOSS Ex2] A: 0.13002 | B: 0.33192 | C: 0.26190 ** [JOINT LOSS] ** : 0.879665 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002935 | Grad Max: 0.074390 -> Layer: shared_layers.0.bias | Grad Mean: 0.149697 | Grad Max: 0.736040 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.005560 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003566 | Grad Max: 0.003566 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001053 | Grad Max: 0.158942 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018699 | Grad Max: 0.883926 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.006179 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008715 | Grad Max: 0.059781 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000254 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001639 | Grad Max: 0.004773 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000443 | Grad Max: 0.001160 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000468 | Grad Max: 0.001659 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008496 | Grad Max: 0.008496 [GRADIENT NORM TOTAL] 3.1653 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.576 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6833938 0.3166062] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 691/1357 | B: 572/1284 | C: 455/1593 [LOSS Ex1] A: 0.64622 | B: 0.63757 | C: 0.63371 [LOGITS Ex2 A] 
Mean Abs: 1.983 | Max: 5.801 [LOSS Ex2] A: 0.15207 | B: 0.34640 | C: 0.23757 ** [JOINT LOSS] ** : 0.884513 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004642 | Grad Max: 0.150627 -> Layer: shared_layers.0.bias | Grad Mean: 0.301995 | Grad Max: 1.116721 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002142 | Grad Max: 0.005898 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002200 | Grad Max: 0.002200 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001951 | Grad Max: 0.290062 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035239 | Grad Max: 1.625833 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.009889 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018197 | Grad Max: 0.095567 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003977 | Grad Max: 0.009211 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001055 | Grad Max: 0.002540 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000879 | Grad Max: 0.002051 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019273 | Grad Max: 0.019273 [GRADIENT NORM TOTAL] 5.9601 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.674 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6130964 0.3869036] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 582/1034 | B: 615/1433 | C: 470/1578 [LOSS Ex1] A: 0.64455 | B: 0.63667 | C: 0.63090 [LOGITS Ex2 A] Mean Abs: 2.014 | Max: 5.684 [LOSS Ex2] A: 0.13471 | B: 0.37210 | C: 0.24716 ** [JOINT LOSS] ** : 0.888698 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002297 | Grad Max: 0.070394 -> Layer: shared_layers.0.bias | Grad Mean: 0.238231 | Grad Max: 1.012541 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006579 -> Layer: exit1_layers.0.bias | 
Grad Mean: 0.003793 | Grad Max: 0.003793 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001513 | Grad Max: 0.350475 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027763 | Grad Max: 1.975450 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.010046 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013548 | Grad Max: 0.084816 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000295 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002845 | Grad Max: 0.006547 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000765 | Grad Max: 0.002062 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000658 | Grad Max: 0.001666 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014685 | Grad Max: 0.014685 [GRADIENT NORM TOTAL] 5.2917 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.840 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50728023 0.4927198 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 696/1352 | B: 608/1440 | C: 461/1587 [LOSS Ex1] A: 0.64510 | B: 0.63743 | C: 0.63274 [LOGITS Ex2 A] Mean Abs: 2.015 | Max: 7.032 [LOSS Ex2] A: 0.12976 | B: 0.35909 | C: 0.27371 ** [JOINT LOSS] ** : 0.892608 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003106 | Grad Max: 0.114926 -> Layer: shared_layers.0.bias | Grad Mean: 0.263704 | Grad Max: 1.504686 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005520 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000385 | Grad Max: 0.000385 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001732 | Grad Max: 0.306102 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031979 | Grad Max: 1.728538 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.008031 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014628 | Grad Max: 0.077573 -> Layer: exit2_layers.6.weight | Grad Mean: 
0.000027 | Grad Max: 0.000333 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003087 | Grad Max: 0.007631 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000786 | Grad Max: 0.001952 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000615 | Grad Max: 0.001640 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013534 | Grad Max: 0.013534 [GRADIENT NORM TOTAL] 6.0667 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.791 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5108119 0.48918808] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 692/1356 | B: 611/1437 | C: 497/1551 [LOSS Ex1] A: 0.64222 | B: 0.63319 | C: 0.62824 [LOGITS Ex2 A] Mean Abs: 2.016 | Max: 6.486 [LOSS Ex2] A: 0.14039 | B: 0.33665 | C: 0.23899 ** [JOINT LOSS] ** : 0.873228 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004152 | Grad Max: 0.122086 -> Layer: shared_layers.0.bias | Grad Mean: 0.285836 | Grad Max: 1.574270 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.006127 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001407 | Grad Max: 0.001407 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001920 | Grad Max: 0.310324 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035286 | Grad Max: 1.727903 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.010727 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017228 | Grad Max: 0.107848 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000373 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003682 | Grad Max: 0.007959 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000182 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000978 | Grad Max: 0.002484 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000827 | Grad Max: 0.002242 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018566 | 
Grad Max: 0.018566 [GRADIENT NORM TOTAL] 6.4528 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.816 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5036692 0.4963308] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 692/1356 | B: 572/1284 | C: 331/1045 [LOSS Ex1] A: 0.64123 | B: 0.63740 | C: 0.63029 [LOGITS Ex2 A] Mean Abs: 1.979 | Max: 6.902 [LOSS Ex2] A: 0.14902 | B: 0.34024 | C: 0.28270 ** [JOINT LOSS] ** : 0.893626 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003417 | Grad Max: 0.107495 -> Layer: shared_layers.0.bias | Grad Mean: 0.126889 | Grad Max: 0.712899 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.006376 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004461 | Grad Max: 0.004461 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000928 | Grad Max: 0.147504 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015242 | Grad Max: 0.778373 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003567 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003399 | Grad Max: 0.033007 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000159 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000521 | Grad Max: 0.002464 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000136 | Grad Max: 0.000616 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000181 | Grad Max: 0.000762 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002574 | Grad Max: 0.002574 [GRADIENT NORM TOTAL] 2.8785 [EPOCH SUMMARY] Train Loss: 0.8882 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8707 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8710 -> New: 0.8707) ############################## EPOCH 115/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.676 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50261533 0.4973846 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 615/1433 | C: 467/1581 [LOSS Ex1] A: 0.64919 | B: 0.63650 | C: 0.63048 [LOGITS Ex2 A] Mean Abs: 1.934 | Max: 5.933 [LOSS Ex2] A: 0.13478 | B: 0.37024 | C: 0.25907 ** [JOINT LOSS] ** : 0.893422 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004475 | Grad Max: 0.103009 -> Layer: shared_layers.0.bias | Grad Mean: 0.255799 | Grad Max: 1.205365 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005868 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000941 | Grad Max: 0.000941 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001815 | Grad Max: 0.309473 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033791 | Grad Max: 1.749633 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000268 | Grad Max: 0.008641 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017225 | Grad Max: 0.092026 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000362 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003676 | Grad Max: 0.008145 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000187 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000991 | Grad Max: 0.002320 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000934 | Grad Max: 0.002075 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019351 | Grad Max: 0.019351 [GRADIENT NORM TOTAL] 5.5997 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.602 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54257524 0.45742473] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 666/1382 | B: 609/1439 | C: 451/1597 [LOSS Ex1] A: 0.64915 | B: 0.63726 | C: 0.63368 [LOGITS Ex2 A] Mean Abs: 1.949 | Max: 5.951 [LOSS Ex2] A: 0.14232 | B: 0.36852 | C: 0.26778 ** [JOINT LOSS] ** : 0.899576 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.063664 -> Layer: shared_layers.0.bias | Grad Mean: 0.142491 | Grad Max: 0.793353 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.006331 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010423 | Grad Max: 0.010423 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000946 | Grad Max: 0.218812 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016986 | Grad Max: 1.212684 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.006741 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006206 | Grad Max: 0.059539 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001214 | Grad Max: 0.003795 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000292 | Grad Max: 0.001013 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000237 | Grad Max: 0.000869 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003585 | Grad Max: 0.003585 [GRADIENT NORM TOTAL] 3.4889 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.759 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7316437 0.26835635] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.066 [MASKS] A(Pass/Fail): 720/1328 | B: 611/1437 | C: 500/1548 [LOSS Ex1] A: 0.64356 | B: 0.63301 | C: 0.62741 [LOGITS Ex2 A] Mean Abs: 1.990 | Max: 5.949 [LOSS Ex2] A: 0.12478 | B: 0.34607 | C: 0.25921 ** [JOINT LOSS] ** : 0.878014 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002464 | Grad Max: 0.077457 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.140781 | Grad Max: 0.630270 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002364 | Grad Max: 0.006406 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006408 | Grad Max: 0.006408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000941 | Grad Max: 0.183176 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016661 | Grad Max: 1.022259 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.005224 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005511 | Grad Max: 0.036755 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001228 | Grad Max: 0.003347 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000335 | Grad Max: 0.000917 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001421 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006771 | Grad Max: 0.006771 [GRADIENT NORM TOTAL] 3.2763 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.843 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008713 0.49912864] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 703/1345 | B: 573/1283 | C: 450/1598 [LOSS Ex1] A: 0.64985 | B: 0.63724 | C: 0.63160 [LOGITS Ex2 A] Mean Abs: 1.985 | Max: 5.588 [LOSS Ex2] A: 0.12021 | B: 0.33936 | C: 0.28324 ** [JOINT LOSS] ** : 0.887167 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003879 | Grad Max: 0.107966 -> Layer: shared_layers.0.bias | Grad Mean: 0.159343 | Grad Max: 0.787123 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005773 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002359 | Grad Max: 0.002359 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001210 | Grad Max: 0.219242 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021908 | Grad Max: 1.223605 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000157 
| Grad Max: 0.006364 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009999 | Grad Max: 0.050697 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000265 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002228 | Grad Max: 0.005546 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000604 | Grad Max: 0.001800 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000565 | Grad Max: 0.001609 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011735 | Grad Max: 0.011735 [GRADIENT NORM TOTAL] 3.5921 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.580 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6842632 0.31573677] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 690/1358 | B: 616/1432 | C: 475/1573 [LOSS Ex1] A: 0.64598 | B: 0.63634 | C: 0.62811 [LOGITS Ex2 A] Mean Abs: 1.991 | Max: 5.813 [LOSS Ex2] A: 0.14317 | B: 0.36348 | C: 0.23737 ** [JOINT LOSS] ** : 0.884815 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002515 | Grad Max: 0.043374 -> Layer: shared_layers.0.bias | Grad Mean: 0.106495 | Grad Max: 0.529414 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.006004 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001035 | Grad Max: 0.001035 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000678 | Grad Max: 0.264565 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011676 | Grad Max: 1.496614 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.002851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002818 | Grad Max: 0.024958 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000562 | Grad Max: 0.002527 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000142 | Grad Max: 
0.000761 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000409 | Grad Max: 0.001052 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001728 | Grad Max: 0.001728 [GRADIENT NORM TOTAL] 3.0609 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.678 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6135699 0.38643003] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 581/1035 | B: 610/1438 | C: 439/1609 [LOSS Ex1] A: 0.64430 | B: 0.63709 | C: 0.63439 [LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.362 [LOSS Ex2] A: 0.13377 | B: 0.36514 | C: 0.28525 ** [JOINT LOSS] ** : 0.899981 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005249 | Grad Max: 0.116040 -> Layer: shared_layers.0.bias | Grad Mean: 0.330195 | Grad Max: 1.472547 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.006170 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000818 | Grad Max: 0.000818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.285612 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040109 | Grad Max: 1.603897 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.011988 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019549 | Grad Max: 0.101784 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000412 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004136 | Grad Max: 0.008770 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000197 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001086 | Grad Max: 0.002662 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000919 | Grad Max: 0.002114 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019603 | Grad Max: 0.019603 [GRADIENT NORM TOTAL] 6.7389 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.845 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.50727564 0.4927244 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 698/1350 | B: 612/1436 | C: 455/1593 [LOSS Ex1] A: 0.64486 | B: 0.63284 | C: 0.63100 [LOGITS Ex2 A] Mean Abs: 2.017 | Max: 6.661 [LOSS Ex2] A: 0.13678 | B: 0.34742 | C: 0.26300 ** [JOINT LOSS] ** : 0.885298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002486 | Grad Max: 0.052567 -> Layer: shared_layers.0.bias | Grad Mean: 0.132829 | Grad Max: 0.657448 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002282 | Grad Max: 0.005642 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001575 | Grad Max: 0.001575 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000984 | Grad Max: 0.202102 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017770 | Grad Max: 1.134006 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.004590 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007762 | Grad Max: 0.040765 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001631 | Grad Max: 0.004946 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000430 | Grad Max: 0.001075 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001293 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007625 | Grad Max: 0.007625 [GRADIENT NORM TOTAL] 3.1927 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.797 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107413 0.48925874] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 692/1356 | B: 573/1283 | C: 460/1588 [LOSS Ex1] A: 0.64197 | B: 0.63706 | C: 0.63263 [LOGITS Ex2 A] Mean Abs: 2.007 | Max: 6.503 [LOSS Ex2] A: 0.13152 | B: 0.35843 | C: 0.26367 ** [JOINT LOSS] ** : 0.888429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.002977 | Grad Max: 0.155390 -> Layer: shared_layers.0.bias | Grad Mean: 0.438176 | Grad Max: 2.079232 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006721 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004190 | Grad Max: 0.004190 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002667 | Grad Max: 0.287820 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049626 | Grad Max: 1.617141 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.016836 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025879 | Grad Max: 0.151395 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000458 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005340 | Grad Max: 0.010700 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001414 | Grad Max: 0.003154 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001109 | Grad Max: 0.002434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025984 | Grad Max: 0.025984 [GRADIENT NORM TOTAL] 8.9674 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.822 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.503761 0.49623898] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 692/1356 | B: 616/1432 | C: 492/1556 [LOSS Ex1] A: 0.64097 | B: 0.63616 | C: 0.62887 [LOGITS Ex2 A] Mean Abs: 1.971 | Max: 6.370 [LOSS Ex2] A: 0.15419 | B: 0.37670 | C: 0.27775 ** [JOINT LOSS] ** : 0.904882 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004731 | Grad Max: 0.173977 -> Layer: shared_layers.0.bias | Grad Mean: 0.513686 | Grad Max: 2.363801 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.006473 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005538 | Grad Max: 0.005538 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003223 | Grad Max: 0.387827 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.060068 | Grad Max: 2.162498 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000464 | Grad Max: 0.015960 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030785 | Grad Max: 0.169271 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000577 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006338 | Grad Max: 0.012421 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000289 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001687 | Grad Max: 0.003936 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001421 | Grad Max: 0.002646 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032210 | Grad Max: 0.032210 [GRADIENT NORM TOTAL] 10.5389 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.681 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5026096 0.4973904] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.065 [MASKS] A(Pass/Fail): 667/1381 | B: 610/1438 | C: 465/1583 [LOSS Ex1] A: 0.64895 | B: 0.63691 | C: 0.63146 [LOGITS Ex2 A] Mean Abs: 1.989 | Max: 6.494 [LOSS Ex2] A: 0.13473 | B: 0.35766 | C: 0.23608 ** [JOINT LOSS] ** : 0.881932 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.052019 -> Layer: shared_layers.0.bias | Grad Mean: 0.085002 | Grad Max: 0.423141 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005569 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005104 | Grad Max: 0.005104 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000668 | Grad Max: 0.152737 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011485 | Grad Max: 0.859001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.003024 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002015 | Grad Max: 0.020445 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000140 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000374 | Grad Max: 0.002234 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | 
Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000094 | Grad Max: 0.000538 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000449 | Grad Max: 0.001196 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001049 | Grad Max: 0.001049 [GRADIENT NORM TOTAL] 2.4969 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.606 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5425833 0.4574167] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 666/1382 | B: 612/1436 | C: 472/1576 [LOSS Ex1] A: 0.64892 | B: 0.63265 | C: 0.63235 [LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.570 [LOSS Ex2] A: 0.14367 | B: 0.34499 | C: 0.25635 ** [JOINT LOSS] ** : 0.886313 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005767 | Grad Max: 0.154884 -> Layer: shared_layers.0.bias | Grad Mean: 0.413429 | Grad Max: 1.972119 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.006595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012174 | Grad Max: 0.012174 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002676 | Grad Max: 0.442663 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050079 | Grad Max: 2.480869 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000382 | Grad Max: 0.013943 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025047 | Grad Max: 0.145791 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005256 | Grad Max: 0.011051 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000255 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001388 | Grad Max: 0.003275 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001194 | Grad Max: 0.002699 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026494 | Grad Max: 0.026494 [GRADIENT NORM TOTAL] 8.8657 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.764 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73289424 0.26710573] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.066 [MASKS] A(Pass/Fail): 721/1327 | B: 573/1283 | C: 456/1592 [LOSS Ex1] A: 0.64331 | B: 0.63688 | C: 0.63417 [LOGITS Ex2 A] Mean Abs: 2.036 | Max: 6.124 [LOSS Ex2] A: 0.14465 | B: 0.33597 | C: 0.26662 ** [JOINT LOSS] ** : 0.887198 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007653 | Grad Max: 0.212161 -> Layer: shared_layers.0.bias | Grad Mean: 0.410863 | Grad Max: 1.542050 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.006063 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005065 | Grad Max: 0.005065 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002699 | Grad Max: 0.382969 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049674 | Grad Max: 2.132585 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.012168 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024802 | Grad Max: 0.132445 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000495 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005315 | Grad Max: 0.010985 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000260 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001407 | Grad Max: 0.003356 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001254 | Grad Max: 0.002632 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026988 | Grad Max: 0.026988 [GRADIENT NORM TOTAL] 8.2325 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.849 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008316 0.4991684] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.065 [MASKS] A(Pass/Fail): 704/1344 | B: 615/1433 | C: 512/1536 [LOSS Ex1] A: 0.64962 | B: 0.63598 | C: 0.62640 [LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.770 [LOSS Ex2] A: 0.12182 | B: 0.36233 | C: 
0.25004 ** [JOINT LOSS] ** : 0.882062 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002433 | Grad Max: 0.074010 -> Layer: shared_layers.0.bias | Grad Mean: 0.135247 | Grad Max: 0.595598 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006088 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002839 | Grad Max: 0.002839 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000837 | Grad Max: 0.446815 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014504 | Grad Max: 2.480123 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.004337 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004479 | Grad Max: 0.036868 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000212 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001027 | Grad Max: 0.003508 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000281 | Grad Max: 0.000982 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000373 | Grad Max: 0.001182 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006165 | Grad Max: 0.006165 [GRADIENT NORM TOTAL] 4.0189 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.585 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.68518806 0.31481192] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 690/1358 | B: 610/1438 | C: 309/1067 [LOSS Ex1] A: 0.64574 | B: 0.63673 | C: 0.62918 [LOGITS Ex2 A] Mean Abs: 1.995 | Max: 6.400 [LOSS Ex2] A: 0.14671 | B: 0.36555 | C: 0.26040 ** [JOINT LOSS] ** : 0.894771 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007010 | Grad Max: 0.205263 -> Layer: shared_layers.0.bias | Grad Mean: 0.302700 | Grad Max: 1.328791 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005736 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006249 | Grad Max: 0.006249 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.346935 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037540 | Grad Max: 1.962603 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000299 | Grad Max: 0.010264 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019079 | Grad Max: 0.105194 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000442 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004112 | Grad Max: 0.008395 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001086 | Grad Max: 0.002585 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001024 | Grad Max: 0.002113 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020763 | Grad Max: 0.020763 [GRADIENT NORM TOTAL] 6.2918 [EPOCH SUMMARY] Train Loss: 0.8896 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8685 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8707 -> New: 0.8685) ############################## EPOCH 116/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.683 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6141272 0.38587278] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 582/1034 | B: 612/1436 | C: 469/1579 [LOSS Ex1] A: 0.64406 | B: 0.63247 | C: 0.63019 [LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.828 [LOSS Ex2] A: 0.13485 | B: 0.34244 | C: 0.26914 ** [JOINT LOSS] ** : 0.884382 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.133813 -> Layer: shared_layers.0.bias | Grad Mean: 0.102221 | Grad Max: 0.472252 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002223 | Grad Max: 0.006191 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000854 | Grad Max: 0.000854 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000788 | Grad Max: 0.241629 -> Layer: exit2_layers.0.bias 
| Grad Mean: 0.013871 | Grad Max: 1.341937 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004018 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005068 | Grad Max: 0.030859 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000211 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001109 | Grad Max: 0.003285 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000307 | Grad Max: 0.000979 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.001224 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006850 | Grad Max: 0.006850 [GRADIENT NORM TOTAL] 2.7974 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.851 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072864 0.49271357] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 700/1348 | B: 573/1283 | C: 463/1585 [LOSS Ex1] A: 0.64462 | B: 0.63670 | C: 0.63211 [LOGITS Ex2 A] Mean Abs: 2.046 | Max: 7.474 [LOSS Ex2] A: 0.12980 | B: 0.34813 | C: 0.26650 ** [JOINT LOSS] ** : 0.885956 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004396 | Grad Max: 0.157146 -> Layer: shared_layers.0.bias | Grad Mean: 0.380320 | Grad Max: 2.015921 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005996 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001186 | Grad Max: 0.001186 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002494 | Grad Max: 0.417714 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045625 | Grad Max: 2.325788 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.012854 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022416 | Grad Max: 0.131672 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000474 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004773 | Grad Max: 0.010799 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000017 | Grad Max: 0.000222 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001256 | Grad Max: 0.002958 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001015 | Grad Max: 0.002400 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022913 | Grad Max: 0.022913 [GRADIENT NORM TOTAL] 8.3392 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.802 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51072824 0.48927176] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 692/1356 | B: 615/1433 | C: 443/1605 [LOSS Ex1] A: 0.64172 | B: 0.63581 | C: 0.63101 [LOGITS Ex2 A] Mean Abs: 2.055 | Max: 6.146 [LOSS Ex2] A: 0.13592 | B: 0.35747 | C: 0.26489 ** [JOINT LOSS] ** : 0.888942 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005166 | Grad Max: 0.150665 -> Layer: shared_layers.0.bias | Grad Mean: 0.339502 | Grad Max: 1.674165 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006531 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007798 | Grad Max: 0.007798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.315985 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041994 | Grad Max: 1.763909 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000323 | Grad Max: 0.011845 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021050 | Grad Max: 0.109777 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000505 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004479 | Grad Max: 0.010217 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001160 | Grad Max: 0.002821 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000968 | Grad Max: 0.002253 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020991 | Grad Max: 0.020991 [GRADIENT NORM TOTAL] 7.2109 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.827 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50382143 0.49617857] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 692/1356 | B: 610/1438 | C: 482/1566 [LOSS Ex1] A: 0.64072 | B: 0.63656 | C: 0.62889 [LOGITS Ex2 A] Mean Abs: 1.996 | Max: 7.559 [LOSS Ex2] A: 0.15138 | B: 0.35887 | C: 0.25406 ** [JOINT LOSS] ** : 0.890160 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002928 | Grad Max: 0.119263 -> Layer: shared_layers.0.bias | Grad Mean: 0.061432 | Grad Max: 0.239146 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.006712 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000249 | Grad Max: 0.000249 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000732 | Grad Max: 0.153026 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011756 | Grad Max: 0.843758 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003152 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002180 | Grad Max: 0.016998 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000123 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000289 | Grad Max: 0.001920 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000049 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000075 | Grad Max: 0.000431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000292 | Grad Max: 0.000703 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000232 | Grad Max: 0.000232 [GRADIENT NORM TOTAL] 2.2881 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.685 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025766 0.4974234] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.065 [MASKS] A(Pass/Fail): 667/1381 | B: 614/1434 | C: 507/1541 [LOSS Ex1] A: 0.64872 | B: 0.63229 | C: 0.62529 [LOGITS Ex2 A] Mean Abs: 1.962 | Max: 5.880 [LOSS Ex2] A: 0.13275 | B: 0.34854 | C: 
0.25407 ** [JOINT LOSS] ** : 0.880557 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.122446 -> Layer: shared_layers.0.bias | Grad Mean: 0.377017 | Grad Max: 1.465725 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.005875 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003805 | Grad Max: 0.003805 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002440 | Grad Max: 0.275116 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045448 | Grad Max: 1.553258 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000374 | Grad Max: 0.013870 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024626 | Grad Max: 0.145578 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000521 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005166 | Grad Max: 0.011199 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000273 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001326 | Grad Max: 0.003239 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001112 | Grad Max: 0.002562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023929 | Grad Max: 0.023929 [GRADIENT NORM TOTAL] 7.4989 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.112 | Max: 0.610 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54272366 0.45727637] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.550 | Std: 0.063 [MASKS] A(Pass/Fail): 666/1382 | B: 574/1282 | C: 495/1553 [LOSS Ex1] A: 0.64870 | B: 0.63653 | C: 0.63062 [LOGITS Ex2 A] Mean Abs: 1.964 | Max: 5.879 [LOSS Ex2] A: 0.13776 | B: 0.34298 | C: 0.28201 ** [JOINT LOSS] ** : 0.892868 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002984 | Grad Max: 0.096877 -> Layer: shared_layers.0.bias | Grad Mean: 0.198097 | Grad Max: 0.638449 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005928 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007559 | Grad Max: 0.007559 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.001289 | Grad Max: 0.133694 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023195 | Grad Max: 0.748247 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.006317 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011480 | Grad Max: 0.053745 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000264 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002502 | Grad Max: 0.005828 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000670 | Grad Max: 0.001893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000623 | Grad Max: 0.001520 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013405 | Grad Max: 0.013405 [GRADIENT NORM TOTAL] 3.9077 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.770 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73423314 0.2657668 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.067 [MASKS] A(Pass/Fail): 722/1326 | B: 615/1433 | C: 454/1594 [LOSS Ex1] A: 0.64306 | B: 0.63563 | C: 0.63142 [LOGITS Ex2 A] Mean Abs: 2.048 | Max: 6.431 [LOSS Ex2] A: 0.12726 | B: 0.35988 | C: 0.26893 ** [JOINT LOSS] ** : 0.888725 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005916 | Grad Max: 0.147009 -> Layer: shared_layers.0.bias | Grad Mean: 0.415086 | Grad Max: 1.792737 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006297 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007258 | Grad Max: 0.007258 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002714 | Grad Max: 0.290657 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050576 | Grad Max: 1.635965 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.013188 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026060 | Grad Max: 0.135441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000536 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.005536 | Grad Max: 0.011528 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000239 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001458 | Grad Max: 0.003312 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001239 | Grad Max: 0.002708 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027399 | Grad Max: 0.027399 [GRADIENT NORM TOTAL] 8.3678 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.855 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008498 0.49915025] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 704/1344 | B: 610/1438 | C: 461/1587 [LOSS Ex1] A: 0.64940 | B: 0.63638 | C: 0.62761 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 6.098 [LOSS Ex2] A: 0.13511 | B: 0.36397 | C: 0.25306 ** [JOINT LOSS] ** : 0.888512 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005464 | Grad Max: 0.205028 -> Layer: shared_layers.0.bias | Grad Mean: 0.567176 | Grad Max: 2.585366 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005530 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003574 | Grad Max: 0.003574 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003530 | Grad Max: 0.457706 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066178 | Grad Max: 2.561552 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.016744 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035250 | Grad Max: 0.183535 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000623 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007212 | Grad Max: 0.014050 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000317 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001909 | Grad Max: 0.004155 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001572 | Grad Max: 0.003360 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035870 | Grad Max: 0.035870 [GRADIENT NORM TOTAL] 
11.6550 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.589 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.68620825 0.31379178] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 690/1358 | B: 614/1434 | C: 477/1571 [LOSS Ex1] A: 0.64550 | B: 0.63211 | C: 0.62830 [LOGITS Ex2 A] Mean Abs: 2.023 | Max: 6.050 [LOSS Ex2] A: 0.14664 | B: 0.34128 | C: 0.25888 ** [JOINT LOSS] ** : 0.884236 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002573 | Grad Max: 0.066361 -> Layer: shared_layers.0.bias | Grad Mean: 0.201339 | Grad Max: 0.817155 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.005795 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001832 | Grad Max: 0.001832 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001553 | Grad Max: 0.280590 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028158 | Grad Max: 1.570699 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000214 | Grad Max: 0.008475 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014000 | Grad Max: 0.088571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000322 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002760 | Grad Max: 0.006329 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000728 | Grad Max: 0.001704 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000599 | Grad Max: 0.001548 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013530 | Grad Max: 0.013530 [GRADIENT NORM TOTAL] 4.8240 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.687 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6146742 0.38532573] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.067 [MASKS] A(Pass/Fail): 582/1034 | B: 575/1281 | C: 466/1582 [LOSS Ex1] A: 
0.64381 | B: 0.63635 | C: 0.63107 [LOGITS Ex2 A] Mean Abs: 2.016 | Max: 5.822 [LOSS Ex2] A: 0.13664 | B: 0.35931 | C: 0.26685 ** [JOINT LOSS] ** : 0.891347 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007634 | Grad Max: 0.199615 -> Layer: shared_layers.0.bias | Grad Mean: 0.588634 | Grad Max: 2.671726 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006157 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003764 | Grad Max: 0.003764 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003843 | Grad Max: 0.605954 -> Layer: exit2_layers.0.bias | Grad Mean: 0.071244 | Grad Max: 3.378571 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000555 | Grad Max: 0.017299 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036874 | Grad Max: 0.183848 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000716 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007864 | Grad Max: 0.015887 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000372 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002091 | Grad Max: 0.005027 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001829 | Grad Max: 0.003236 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039769 | Grad Max: 0.039769 [GRADIENT NORM TOTAL] 12.3238 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.856 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073193 0.4926808] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 701/1347 | B: 615/1433 | C: 473/1575 [LOSS Ex1] A: 0.64438 | B: 0.63547 | C: 0.63321 [LOGITS Ex2 A] Mean Abs: 1.990 | Max: 6.689 [LOSS Ex2] A: 0.13850 | B: 0.40267 | C: 0.27725 ** [JOINT LOSS] ** : 0.910496 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010120 | Grad Max: 0.257573 -> Layer: shared_layers.0.bias | Grad Mean: 0.769439 | Grad Max: 3.486998 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad 
Max: 0.005918 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002973 | Grad Max: 0.002973 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005087 | Grad Max: 0.727261 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094249 | Grad Max: 4.054877 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000722 | Grad Max: 0.024820 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047819 | Grad Max: 0.254872 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000855 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010222 | Grad Max: 0.020083 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000466 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002731 | Grad Max: 0.006200 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002325 | Grad Max: 0.004519 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052133 | Grad Max: 0.052133 [GRADIENT NORM TOTAL] 16.1813 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.806 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51074547 0.48925453] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 692/1356 | B: 610/1438 | C: 444/1604 [LOSS Ex1] A: 0.64147 | B: 0.63622 | C: 0.63512 [LOGITS Ex2 A] Mean Abs: 1.989 | Max: 5.797 [LOSS Ex2] A: 0.13208 | B: 0.37505 | C: 0.25319 ** [JOINT LOSS] ** : 0.891042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004945 | Grad Max: 0.157434 -> Layer: shared_layers.0.bias | Grad Mean: 0.473878 | Grad Max: 2.211975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.006511 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009663 | Grad Max: 0.009663 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003040 | Grad Max: 0.387090 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056335 | Grad Max: 2.180807 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.016591 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029140 | Grad Max: 0.165657 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000545 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006173 | Grad Max: 0.012750 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000284 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001636 | Grad Max: 0.003873 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001358 | Grad Max: 0.002650 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030706 | Grad Max: 0.030706 [GRADIENT NORM TOTAL] 9.8081 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.831 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5038373 0.4961627] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.066 [MASKS] A(Pass/Fail): 694/1354 | B: 615/1433 | C: 482/1566 [LOSS Ex1] A: 0.64048 | B: 0.63196 | C: 0.62631 [LOGITS Ex2 A] Mean Abs: 2.021 | Max: 6.375 [LOSS Ex2] A: 0.15163 | B: 0.33841 | C: 0.23733 ** [JOINT LOSS] ** : 0.875368 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003710 | Grad Max: 0.115586 -> Layer: shared_layers.0.bias | Grad Mean: 0.145722 | Grad Max: 0.542652 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002390 | Grad Max: 0.006779 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005016 | Grad Max: 0.005016 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001116 | Grad Max: 0.241607 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020066 | Grad Max: 1.353856 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.004117 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007195 | Grad Max: 0.034037 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000233 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001614 | Grad Max: 0.004495 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000106 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000436 | Grad Max: 0.001293 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000425 | Grad Max: 0.001623 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.008475 | Grad Max: 0.008475 [GRADIENT NORM TOTAL] 3.7821 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.689 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50253856 0.49746147] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.551 | Std: 0.065 [MASKS] A(Pass/Fail): 668/1380 | B: 576/1280 | C: 306/1070 [LOSS Ex1] A: 0.64851 | B: 0.63621 | C: 0.63379 [LOGITS Ex2 A] Mean Abs: 2.001 | Max: 6.182 [LOSS Ex2] A: 0.13341 | B: 0.34026 | C: 0.23674 ** [JOINT LOSS] ** : 0.876301 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005787 | Grad Max: 0.177016 -> Layer: shared_layers.0.bias | Grad Mean: 0.406778 | Grad Max: 2.024339 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.005877 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001289 | Grad Max: 0.001289 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002616 | Grad Max: 0.427551 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048767 | Grad Max: 2.383898 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000355 | Grad Max: 0.011562 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023472 | Grad Max: 0.117552 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000498 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005018 | Grad Max: 0.010435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001376 | Grad Max: 0.002896 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001137 | Grad Max: 0.002998 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026785 | Grad Max: 0.026785 [GRADIENT NORM TOTAL] 8.8666 [EPOCH SUMMARY] Train Loss: 0.8878 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8693 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 117/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.614 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54280216 0.45719787] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.063 [MASKS] A(Pass/Fail): 667/1381 | B: 615/1433 | C: 462/1586 [LOSS Ex1] A: 0.64850 | B: 0.63533 | C: 0.63064 [LOGITS Ex2 A] Mean Abs: 1.973 | Max: 5.747 [LOSS Ex2] A: 0.13569 | B: 0.35807 | C: 0.27798 ** [JOINT LOSS] ** : 0.895401 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003310 | Grad Max: 0.108033 -> Layer: shared_layers.0.bias | Grad Mean: 0.282359 | Grad Max: 1.420349 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.006036 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008490 | Grad Max: 0.008490 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001822 | Grad Max: 0.312554 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033818 | Grad Max: 1.746763 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.009457 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015985 | Grad Max: 0.083717 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000344 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003330 | Grad Max: 0.007642 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000864 | Grad Max: 0.002054 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000667 | Grad Max: 0.001721 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015518 | Grad Max: 0.015518 [GRADIENT NORM TOTAL] 6.3170 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.774 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73535657 0.26464346] | 
Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.067 [MASKS] A(Pass/Fail): 722/1326 | B: 610/1438 | C: 462/1586 [LOSS Ex1] A: 0.64284 | B: 0.63608 | C: 0.63584 [LOGITS Ex2 A] Mean Abs: 1.981 | Max: 5.957 [LOSS Ex2] A: 0.12397 | B: 0.36228 | C: 0.27396 ** [JOINT LOSS] ** : 0.891661 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002696 | Grad Max: 0.084339 -> Layer: shared_layers.0.bias | Grad Mean: 0.182847 | Grad Max: 0.794398 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006110 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007461 | Grad Max: 0.007461 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001196 | Grad Max: 0.201089 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021676 | Grad Max: 1.110722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000160 | Grad Max: 0.006746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010575 | Grad Max: 0.064636 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000258 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002123 | Grad Max: 0.005180 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000577 | Grad Max: 0.001523 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000481 | Grad Max: 0.001477 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011240 | Grad Max: 0.011240 [GRADIENT NORM TOTAL] 3.8416 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.860 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008404 0.49915963] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 705/1343 | B: 615/1433 | C: 467/1581 [LOSS Ex1] A: 0.64920 | B: 0.63182 | C: 0.63024 [LOGITS Ex2 A] Mean Abs: 1.998 | Max: 5.824 [LOSS Ex2] A: 0.12094 | B: 0.34612 | C: 0.23995 ** [JOINT LOSS] ** : 0.872755 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004689 | Grad Max: 
0.144382 -> Layer: shared_layers.0.bias | Grad Mean: 0.251673 | Grad Max: 0.886535 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.005803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005695 | Grad Max: 0.005695 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001749 | Grad Max: 0.245642 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032250 | Grad Max: 1.346692 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008237 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015523 | Grad Max: 0.091043 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000367 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003330 | Grad Max: 0.007113 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000876 | Grad Max: 0.002194 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000778 | Grad Max: 0.002073 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016820 | Grad Max: 0.016820 [GRADIENT NORM TOTAL] 5.3683 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.593 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6869606 0.31303945] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.067 [MASKS] A(Pass/Fail): 690/1358 | B: 577/1279 | C: 456/1592 [LOSS Ex1] A: 0.64529 | B: 0.63607 | C: 0.62849 [LOGITS Ex2 A] Mean Abs: 1.995 | Max: 5.849 [LOSS Ex2] A: 0.14597 | B: 0.33664 | C: 0.24967 ** [JOINT LOSS] ** : 0.880711 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002596 | Grad Max: 0.060273 -> Layer: shared_layers.0.bias | Grad Mean: 0.111215 | Grad Max: 0.517684 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006069 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000991 | Grad Max: 0.000991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000742 | Grad Max: 0.141450 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012202 | Grad Max: 0.749668 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.004446 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003730 | Grad Max: 0.033703 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000180 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000742 | Grad Max: 0.003089 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000072 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000215 | Grad Max: 0.000847 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000371 | Grad Max: 0.001234 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005572 | Grad Max: 0.005572 [GRADIENT NORM TOTAL] 2.5386 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.691 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6150381 0.38496187] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.067 [MASKS] A(Pass/Fail): 582/1034 | B: 615/1433 | C: 453/1595 [LOSS Ex1] A: 0.64360 | B: 0.63519 | C: 0.63253 [LOGITS Ex2 A] Mean Abs: 2.063 | Max: 6.850 [LOSS Ex2] A: 0.12162 | B: 0.35808 | C: 0.25980 ** [JOINT LOSS] ** : 0.883605 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002691 | Grad Max: 0.083468 -> Layer: shared_layers.0.bias | Grad Mean: 0.238993 | Grad Max: 1.060120 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.006386 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003840 | Grad Max: 0.003840 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001528 | Grad Max: 0.197251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028159 | Grad Max: 1.091436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.009704 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014285 | Grad Max: 0.087187 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000305 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002985 | Grad Max: 0.006870 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000158 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000769 | Grad Max: 0.001986 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001848 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012782 | Grad Max: 0.012782 [GRADIENT NORM TOTAL] 4.9117 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.861 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073497 0.4926502] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.067 [MASKS] A(Pass/Fail): 701/1347 | B: 613/1435 | C: 488/1560 [LOSS Ex1] A: 0.64417 | B: 0.63594 | C: 0.62781 [LOGITS Ex2 A] Mean Abs: 2.034 | Max: 8.542 [LOSS Ex2] A: 0.12539 | B: 0.35052 | C: 0.26361 ** [JOINT LOSS] ** : 0.882481 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003037 | Grad Max: 0.144780 -> Layer: shared_layers.0.bias | Grad Mean: 0.167506 | Grad Max: 0.814997 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005727 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003009 | Grad Max: 0.003009 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000942 | Grad Max: 0.421650 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016940 | Grad Max: 2.354693 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.004760 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004847 | Grad Max: 0.043996 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000203 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001002 | Grad Max: 0.003193 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000254 | Grad Max: 0.000807 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001236 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004505 | Grad Max: 0.004505 [GRADIENT NORM TOTAL] 4.5483 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.120 | Max: 0.811 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51066923 0.4893308 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 692/1356 | B: 615/1433 | C: 510/1538 [LOSS Ex1] A: 0.64125 | B: 0.63167 | C: 0.62555 [LOGITS Ex2 A] Mean Abs: 2.017 | Max: 6.240 [LOSS Ex2] A: 0.12901 | B: 0.34481 | C: 0.26554 ** [JOINT LOSS] ** : 0.879275 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002009 | Grad Max: 0.055756 -> Layer: shared_layers.0.bias | Grad Mean: 0.133702 | Grad Max: 0.787369 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002467 | Grad Max: 0.006466 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008749 | Grad Max: 0.008749 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001036 | Grad Max: 0.251035 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018543 | Grad Max: 1.386706 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.004232 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006091 | Grad Max: 0.036515 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000182 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001238 | Grad Max: 0.003796 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000329 | Grad Max: 0.001058 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001167 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006092 | Grad Max: 0.006092 [GRADIENT NORM TOTAL] 3.9182 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.836 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50393134 0.49606866] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 694/1354 | B: 577/1279 | C: 469/1579 [LOSS Ex1] A: 0.64026 | B: 0.63591 | C: 0.62873 [LOGITS Ex2 A] Mean Abs: 1.995 | Max: 6.157 [LOSS Ex2] A: 0.15203 | B: 0.33951 | C: 0.23877 ** [JOINT LOSS] ** : 
0.878406 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003813 | Grad Max: 0.158306 -> Layer: shared_layers.0.bias | Grad Mean: 0.081935 | Grad Max: 0.329071 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.006153 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002334 | Grad Max: 0.002334 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000770 | Grad Max: 0.182728 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011900 | Grad Max: 1.012790 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003810 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002323 | Grad Max: 0.026332 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000161 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000469 | Grad Max: 0.002983 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000129 | Grad Max: 0.000605 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000378 | Grad Max: 0.001093 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001723 | Grad Max: 0.001723 [GRADIENT NORM TOTAL] 2.2884 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.693 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025383 0.49746168] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 668/1380 | B: 615/1433 | C: 481/1567 [LOSS Ex1] A: 0.64830 | B: 0.63502 | C: 0.62874 [LOGITS Ex2 A] Mean Abs: 1.984 | Max: 5.510 [LOSS Ex2] A: 0.12722 | B: 0.36023 | C: 0.24143 ** [JOINT LOSS] ** : 0.880313 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002245 | Grad Max: 0.051559 -> Layer: shared_layers.0.bias | Grad Mean: 0.082108 | Grad Max: 0.585899 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005935 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006808 | Grad Max: 0.006808 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000597 | Grad 
Max: 0.097763 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010255 | Grad Max: 0.509453 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002833 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001802 | Grad Max: 0.016736 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000166 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000297 | Grad Max: 0.002063 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000076 | Grad Max: 0.000542 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.000942 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000106 | Grad Max: 0.000106 [GRADIENT NORM TOTAL] 1.9875 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.113 | Max: 0.618 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54285675 0.45714328] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 613/1435 | C: 472/1576 [LOSS Ex1] A: 0.64829 | B: 0.63576 | C: 0.63155 [LOGITS Ex2 A] Mean Abs: 1.972 | Max: 5.889 [LOSS Ex2] A: 0.13702 | B: 0.35788 | C: 0.25902 ** [JOINT LOSS] ** : 0.889834 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003285 | Grad Max: 0.109914 -> Layer: shared_layers.0.bias | Grad Mean: 0.129786 | Grad Max: 0.651775 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005853 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007357 | Grad Max: 0.007357 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001006 | Grad Max: 0.224363 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017123 | Grad Max: 1.248864 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.004963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004741 | Grad Max: 0.036310 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000188 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000841 | Grad Max: 0.003578 
-> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000220 | Grad Max: 0.000795 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.000950 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003912 | Grad Max: 0.003912 [GRADIENT NORM TOTAL] 3.4073 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.779 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7366611 0.26333892] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.067 [MASKS] A(Pass/Fail): 722/1326 | B: 615/1433 | C: 511/1537 [LOSS Ex1] A: 0.64261 | B: 0.63147 | C: 0.62404 [LOGITS Ex2 A] Mean Abs: 2.020 | Max: 5.884 [LOSS Ex2] A: 0.11526 | B: 0.34628 | C: 0.26260 ** [JOINT LOSS] ** : 0.874087 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003082 | Grad Max: 0.066919 -> Layer: shared_layers.0.bias | Grad Mean: 0.124325 | Grad Max: 0.460529 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.006659 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001158 | Grad Max: 0.001158 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000926 | Grad Max: 0.154986 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017218 | Grad Max: 0.864799 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004569 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008072 | Grad Max: 0.042038 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000263 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001670 | Grad Max: 0.004791 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000434 | Grad Max: 0.001148 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000471 | Grad Max: 0.001461 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008143 | Grad Max: 0.008143 [GRADIENT NORM TOTAL] 2.8305 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.865 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008229 0.49917713] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.066 [MASKS] A(Pass/Fail): 705/1343 | B: 577/1279 | C: 449/1599 [LOSS Ex1] A: 0.64898 | B: 0.63571 | C: 0.63036 [LOGITS Ex2 A] Mean Abs: 2.045 | Max: 6.122 [LOSS Ex2] A: 0.12510 | B: 0.33710 | C: 0.27229 ** [JOINT LOSS] ** : 0.883177 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001905 | Grad Max: 0.064026 -> Layer: shared_layers.0.bias | Grad Mean: 0.134315 | Grad Max: 0.725793 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005782 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002564 | Grad Max: 0.002564 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000995 | Grad Max: 0.198799 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017991 | Grad Max: 1.113783 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.006542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007496 | Grad Max: 0.053893 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001442 | Grad Max: 0.004185 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001240 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000297 | Grad Max: 0.001213 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006868 | Grad Max: 0.006868 [GRADIENT NORM TOTAL] 3.2013 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.597 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.68784106 0.31215897] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.067 [MASKS] A(Pass/Fail): 690/1358 | B: 615/1433 | C: 452/1596 [LOSS Ex1] A: 0.64504 | B: 0.63481 | C: 0.63239 [LOGITS Ex2 A] Mean Abs: 
2.026 | Max: 6.399 [LOSS Ex2] A: 0.14482 | B: 0.35572 | C: 0.25875 ** [JOINT LOSS] ** : 0.890509 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002038 | Grad Max: 0.037767 -> Layer: shared_layers.0.bias | Grad Mean: 0.088189 | Grad Max: 0.382013 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005902 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001425 | Grad Max: 0.001425 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000654 | Grad Max: 0.457886 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011308 | Grad Max: 2.543863 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002419 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002039 | Grad Max: 0.018893 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000150 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000353 | Grad Max: 0.001975 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000503 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000228 | Grad Max: 0.000863 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001772 | Grad Max: 0.001772 [GRADIENT NORM TOTAL] 3.5553 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.696 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61554265 0.38445738] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.067 [MASKS] A(Pass/Fail): 582/1034 | B: 613/1435 | C: 313/1063 [LOSS Ex1] A: 0.64334 | B: 0.63553 | C: 0.62897 [LOGITS Ex2 A] Mean Abs: 2.082 | Max: 7.467 [LOSS Ex2] A: 0.12561 | B: 0.35612 | C: 0.25399 ** [JOINT LOSS] ** : 0.881186 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002313 | Grad Max: 0.038081 -> Layer: shared_layers.0.bias | Grad Mean: 0.107075 | Grad Max: 0.529798 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006354 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.002872 | Grad Max: 0.002872 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000778 | Grad Max: 0.195333 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013538 | Grad Max: 1.101705 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.004487 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003889 | Grad Max: 0.032440 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000763 | Grad Max: 0.003038 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000214 | Grad Max: 0.000781 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000333 | Grad Max: 0.001120 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004629 | Grad Max: 0.004629 [GRADIENT NORM TOTAL] 2.7648 [EPOCH SUMMARY] Train Loss: 0.8831 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8650 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8685 -> New: 0.8650) ############################## EPOCH 118/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.868 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073756 0.49262443] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 702/1346 | B: 615/1433 | C: 476/1572 [LOSS Ex1] A: 0.64391 | B: 0.63123 | C: 0.62546 [LOGITS Ex2 A] Mean Abs: 2.079 | Max: 8.594 [LOSS Ex2] A: 0.12453 | B: 0.34479 | C: 0.23734 ** [JOINT LOSS] ** : 0.869086 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003564 | Grad Max: 0.123815 -> Layer: shared_layers.0.bias | Grad Mean: 0.127870 | Grad Max: 0.624806 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002329 | Grad Max: 0.006112 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004159 | Grad Max: 0.004159 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001185 | Grad Max: 
0.286979 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021083 | Grad Max: 1.574776 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000130 | Grad Max: 0.004044 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008150 | Grad Max: 0.035002 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000272 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001840 | Grad Max: 0.004662 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000506 | Grad Max: 0.001191 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000508 | Grad Max: 0.001611 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010120 | Grad Max: 0.010120 [GRADIENT NORM TOTAL] 3.7276 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.817 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5106333 0.48936668] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 692/1356 | B: 578/1278 | C: 484/1564 [LOSS Ex1] A: 0.64097 | B: 0.63547 | C: 0.62903 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.126 [LOSS Ex2] A: 0.13766 | B: 0.33150 | C: 0.29036 ** [JOINT LOSS] ** : 0.888326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.042046 -> Layer: shared_layers.0.bias | Grad Mean: 0.101142 | Grad Max: 0.531291 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.006226 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004372 | Grad Max: 0.004372 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000832 | Grad Max: 0.166197 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014489 | Grad Max: 0.894474 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.004391 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004247 | Grad Max: 0.033946 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000182 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000913 | Grad Max: 0.002952 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000074 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000268 | Grad Max: 0.001039 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000198 | Grad Max: 0.000626 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005498 | Grad Max: 0.005498 [GRADIENT NORM TOTAL] 2.5857 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.843 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5040401 0.49595985] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 694/1354 | B: 618/1430 | C: 482/1566 [LOSS Ex1] A: 0.63996 | B: 0.63457 | C: 0.62932 [LOGITS Ex2 A] Mean Abs: 2.062 | Max: 7.392 [LOSS Ex2] A: 0.13704 | B: 0.36123 | C: 0.23973 ** [JOINT LOSS] ** : 0.880616 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002386 | Grad Max: 0.062750 -> Layer: shared_layers.0.bias | Grad Mean: 0.062765 | Grad Max: 0.277261 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006702 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002735 | Grad Max: 0.002735 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000665 | Grad Max: 0.158514 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011061 | Grad Max: 0.870404 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002454 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002186 | Grad Max: 0.021681 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000115 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000380 | Grad Max: 0.002361 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000725 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000271 | Grad Max: 0.001052 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002104 | Grad Max: 0.002104 [GRADIENT NORM TOTAL] 2.1017 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.699 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50252694 0.49747303] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 669/1379 | B: 613/1435 | C: 475/1573 [LOSS Ex1] A: 0.64801 | B: 0.63528 | C: 0.63063 [LOGITS Ex2 A] Mean Abs: 2.059 | Max: 6.622 [LOSS Ex2] A: 0.12278 | B: 0.35417 | C: 0.25991 ** [JOINT LOSS] ** : 0.883596 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.117395 -> Layer: shared_layers.0.bias | Grad Mean: 0.228976 | Grad Max: 1.395059 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002042 | Grad Max: 0.006065 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008917 | Grad Max: 0.008917 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001463 | Grad Max: 0.257280 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025811 | Grad Max: 1.435367 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.007003 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008843 | Grad Max: 0.070127 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001634 | Grad Max: 0.005038 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000401 | Grad Max: 0.001157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.001090 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006301 | Grad Max: 0.006301 [GRADIENT NORM TOTAL] 5.1017 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.623 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.542925 0.45707506] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 615/1433 | C: 460/1588 [LOSS Ex1] A: 0.64801 | B: 0.63098 | C: 0.62984 [LOGITS Ex2 A] Mean Abs: 
2.019 | Max: 5.710 [LOSS Ex2] A: 0.12503 | B: 0.34189 | C: 0.25494 ** [JOINT LOSS] ** : 0.876896 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004426 | Grad Max: 0.160879 -> Layer: shared_layers.0.bias | Grad Mean: 0.150860 | Grad Max: 0.766975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.006265 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009393 | Grad Max: 0.009393 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001177 | Grad Max: 0.194647 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019706 | Grad Max: 1.012411 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.006114 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005883 | Grad Max: 0.051696 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000179 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001046 | Grad Max: 0.003218 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000282 | Grad Max: 0.000969 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001280 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005297 | Grad Max: 0.005297 [GRADIENT NORM TOTAL] 3.5320 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.786 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7385293 0.26147068] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.068 [MASKS] A(Pass/Fail): 724/1324 | B: 578/1278 | C: 477/1571 [LOSS Ex1] A: 0.64230 | B: 0.63521 | C: 0.62856 [LOGITS Ex2 A] Mean Abs: 2.044 | Max: 6.314 [LOSS Ex2] A: 0.13149 | B: 0.34454 | C: 0.27042 ** [JOINT LOSS] ** : 0.884171 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005466 | Grad Max: 0.166292 -> Layer: shared_layers.0.bias | Grad Mean: 0.290797 | Grad Max: 1.089170 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.006024 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.005556 | Grad Max: 0.005556 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.266292 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034392 | Grad Max: 1.495508 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.008605 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018161 | Grad Max: 0.080530 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000402 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003950 | Grad Max: 0.008601 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001041 | Grad Max: 0.002686 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000915 | Grad Max: 0.001793 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019534 | Grad Max: 0.019534 [GRADIENT NORM TOTAL] 5.6737 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.873 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008069 0.49919307] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 705/1343 | B: 620/1428 | C: 479/1569 [LOSS Ex1] A: 0.64868 | B: 0.63432 | C: 0.62772 [LOGITS Ex2 A] Mean Abs: 2.061 | Max: 5.831 [LOSS Ex2] A: 0.11687 | B: 0.35845 | C: 0.26304 ** [JOINT LOSS] ** : 0.883024 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004430 | Grad Max: 0.128560 -> Layer: shared_layers.0.bias | Grad Mean: 0.173631 | Grad Max: 0.818400 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.005690 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003337 | Grad Max: 0.003337 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001180 | Grad Max: 0.535256 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021201 | Grad Max: 2.992258 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.005949 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008098 | Grad Max: 0.051575 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | 
Grad Max: 0.000265 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001871 | Grad Max: 0.005031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000515 | Grad Max: 0.001337 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000522 | Grad Max: 0.001490 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010415 | Grad Max: 0.010415 [GRADIENT NORM TOTAL] 4.8991 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.603 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6890237 0.3109764] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.067 [MASKS] A(Pass/Fail): 690/1358 | B: 613/1435 | C: 470/1578 [LOSS Ex1] A: 0.64471 | B: 0.63504 | C: 0.63199 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 5.784 [LOSS Ex2] A: 0.14455 | B: 0.35215 | C: 0.27540 ** [JOINT LOSS] ** : 0.894619 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003459 | Grad Max: 0.115831 -> Layer: shared_layers.0.bias | Grad Mean: 0.298520 | Grad Max: 1.532266 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005909 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007841 | Grad Max: 0.007841 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.336028 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036836 | Grad Max: 1.888932 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.007852 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016768 | Grad Max: 0.071836 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000368 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003511 | Grad Max: 0.007969 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000182 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002330 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000698 | Grad Max: 0.001696 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014771 | Grad Max: 
0.014771 [GRADIENT NORM TOTAL] 6.8503 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.702 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61610585 0.38389418] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 582/1034 | B: 615/1433 | C: 488/1560 [LOSS Ex1] A: 0.64300 | B: 0.63074 | C: 0.62621 [LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.304 [LOSS Ex2] A: 0.12166 | B: 0.33325 | C: 0.23975 ** [JOINT LOSS] ** : 0.864871 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004241 | Grad Max: 0.131772 -> Layer: shared_layers.0.bias | Grad Mean: 0.241557 | Grad Max: 0.928953 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002300 | Grad Max: 0.006336 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005991 | Grad Max: 0.005991 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001671 | Grad Max: 0.240553 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029924 | Grad Max: 1.237408 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008245 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014858 | Grad Max: 0.081060 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000332 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003252 | Grad Max: 0.007119 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000844 | Grad Max: 0.002195 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000664 | Grad Max: 0.001918 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015208 | Grad Max: 0.015208 [GRADIENT NORM TOTAL] 5.0880 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.875 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074171 0.4925829] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 702/1346 | B: 578/1278 | C: 
480/1568 [LOSS Ex1] A: 0.64358 | B: 0.63499 | C: 0.62862 [LOGITS Ex2 A] Mean Abs: 2.077 | Max: 7.743 [LOSS Ex2] A: 0.12222 | B: 0.33976 | C: 0.27169 ** [JOINT LOSS] ** : 0.880285 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003411 | Grad Max: 0.120736 -> Layer: shared_layers.0.bias | Grad Mean: 0.217723 | Grad Max: 1.111595 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002233 | Grad Max: 0.006030 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004413 | Grad Max: 0.004413 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001474 | Grad Max: 0.240977 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026125 | Grad Max: 1.318183 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.007883 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011265 | Grad Max: 0.081759 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000260 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002085 | Grad Max: 0.004944 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000539 | Grad Max: 0.001631 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000440 | Grad Max: 0.001342 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010285 | Grad Max: 0.010285 [GRADIENT NORM TOTAL] 4.8720 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.824 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51054645 0.48945358] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 692/1356 | B: 620/1428 | C: 468/1580 [LOSS Ex1] A: 0.64063 | B: 0.63411 | C: 0.63061 [LOGITS Ex2 A] Mean Abs: 2.064 | Max: 6.448 [LOSS Ex2] A: 0.12333 | B: 0.36329 | C: 0.26403 ** [JOINT LOSS] ** : 0.885336 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004258 | Grad Max: 0.129563 -> Layer: shared_layers.0.bias | Grad Mean: 0.338202 | Grad Max: 1.628029 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002219 | Grad Max: 0.006263 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000715 | Grad Max: 0.000715 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.363811 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040319 | Grad Max: 1.976565 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.011575 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018967 | Grad Max: 0.124275 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000406 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003759 | Grad Max: 0.008381 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000998 | Grad Max: 0.002274 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000840 | Grad Max: 0.001910 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019657 | Grad Max: 0.019657 [GRADIENT NORM TOTAL] 7.6177 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.850 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50418794 0.4958121 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 695/1353 | B: 613/1435 | C: 457/1591 [LOSS Ex1] A: 0.63961 | B: 0.63484 | C: 0.63021 [LOGITS Ex2 A] Mean Abs: 2.055 | Max: 6.658 [LOSS Ex2] A: 0.14222 | B: 0.35407 | C: 0.26782 ** [JOINT LOSS] ** : 0.889590 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005155 | Grad Max: 0.265093 -> Layer: shared_layers.0.bias | Grad Mean: 0.180928 | Grad Max: 0.724091 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.006190 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004134 | Grad Max: 0.004134 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.236053 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023536 | Grad Max: 1.330280 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.006025 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.009470 | Grad Max: 0.052010 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002226 | Grad Max: 0.005432 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000128 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001394 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000473 | Grad Max: 0.001453 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009934 | Grad Max: 0.009934 [GRADIENT NORM TOTAL] 4.2578 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.705 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5025715 0.4974285] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 669/1379 | B: 615/1433 | C: 465/1583 [LOSS Ex1] A: 0.64771 | B: 0.63054 | C: 0.62708 [LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.659 [LOSS Ex2] A: 0.12550 | B: 0.34319 | C: 0.23397 ** [JOINT LOSS] ** : 0.869329 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002533 | Grad Max: 0.087508 -> Layer: shared_layers.0.bias | Grad Mean: 0.121051 | Grad Max: 0.699329 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005677 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004151 | Grad Max: 0.004151 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000785 | Grad Max: 0.197391 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013721 | Grad Max: 1.099622 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003207 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002800 | Grad Max: 0.020416 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000624 | Grad Max: 0.002599 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.000803 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | 
Grad Max: 0.001092 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003586 | Grad Max: 0.003586 [GRADIENT NORM TOTAL] 3.0923 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.114 | Max: 0.629 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429262 0.45707378] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.064 [MASKS] A(Pass/Fail): 667/1381 | B: 578/1278 | C: 308/1068 [LOSS Ex1] A: 0.64772 | B: 0.63480 | C: 0.63120 [LOGITS Ex2 A] Mean Abs: 2.031 | Max: 6.584 [LOSS Ex2] A: 0.13131 | B: 0.33552 | C: 0.25513 ** [JOINT LOSS] ** : 0.878561 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003778 | Grad Max: 0.156519 -> Layer: shared_layers.0.bias | Grad Mean: 0.119198 | Grad Max: 0.575449 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005656 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001120 | Grad Max: 0.001120 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000994 | Grad Max: 0.167494 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016780 | Grad Max: 0.931581 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.004372 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005228 | Grad Max: 0.035128 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000220 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001227 | Grad Max: 0.004077 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000289 | Grad Max: 0.001169 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001153 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004470 | Grad Max: 0.004470 [GRADIENT NORM TOTAL] 2.9734 [EPOCH SUMMARY] Train Loss: 0.8806 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8638 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8650 -> New: 0.8638) ############################## EPOCH 119/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.792 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7401366 0.25986338] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.068 [MASKS] A(Pass/Fail): 726/1322 | B: 620/1428 | C: 460/1588 [LOSS Ex1] A: 0.64200 | B: 0.63391 | C: 0.63130 [LOGITS Ex2 A] Mean Abs: 2.081 | Max: 6.505 [LOSS Ex2] A: 0.12977 | B: 0.35641 | C: 0.25631 ** [JOINT LOSS] ** : 0.883234 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002855 | Grad Max: 0.093757 -> Layer: shared_layers.0.bias | Grad Mean: 0.098171 | Grad Max: 0.418892 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.005765 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001128 | Grad Max: 0.001128 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000812 | Grad Max: 0.173357 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013714 | Grad Max: 0.977061 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.004021 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004009 | Grad Max: 0.030381 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000969 | Grad Max: 0.003301 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000253 | Grad Max: 0.001114 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000282 | Grad Max: 0.001117 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004306 | Grad Max: 0.004306 [GRADIENT NORM TOTAL] 2.8156 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.880 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50075 0.49925] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 
0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 709/1339 | B: 613/1435 | C: 484/1564 [LOSS Ex1] A: 0.64841 | B: 0.63463 | C: 0.63095 [LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.293 [LOSS Ex2] A: 0.11849 | B: 0.35320 | C: 0.27064 ** [JOINT LOSS] ** : 0.885440 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003382 | Grad Max: 0.138313 -> Layer: shared_layers.0.bias | Grad Mean: 0.333505 | Grad Max: 1.616851 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005810 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002531 | Grad Max: 0.002531 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.319077 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040859 | Grad Max: 1.754198 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.013062 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019659 | Grad Max: 0.124852 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000390 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003956 | Grad Max: 0.008741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000184 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001034 | Grad Max: 0.002384 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.002020 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018850 | Grad Max: 0.018850 [GRADIENT NORM TOTAL] 7.2545 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.608 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.69003665 0.30996335] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.068 [MASKS] A(Pass/Fail): 694/1354 | B: 615/1433 | C: 460/1588 [LOSS Ex1] A: 0.64442 | B: 0.63033 | C: 0.63360 [LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.844 [LOSS Ex2] A: 0.14404 | B: 0.34545 | C: 0.28264 ** [JOINT LOSS] ** : 0.893496 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002753 | Grad Max: 0.064510 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.182465 | Grad Max: 0.707772 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005721 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001224 | Grad Max: 0.001224 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001390 | Grad Max: 0.165764 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024897 | Grad Max: 0.896910 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000177 | Grad Max: 0.007745 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011955 | Grad Max: 0.070349 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000298 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002415 | Grad Max: 0.005928 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001889 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001097 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009946 | Grad Max: 0.009946 [GRADIENT NORM TOTAL] 4.0795 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.708 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6167099 0.3832901] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.068 [MASKS] A(Pass/Fail): 586/1030 | B: 578/1278 | C: 486/1562 [LOSS Ex1] A: 0.64272 | B: 0.63458 | C: 0.62600 [LOGITS Ex2 A] Mean Abs: 2.098 | Max: 7.227 [LOSS Ex2] A: 0.11618 | B: 0.35130 | C: 0.25887 ** [JOINT LOSS] ** : 0.876552 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005784 | Grad Max: 0.153037 -> Layer: shared_layers.0.bias | Grad Mean: 0.401699 | Grad Max: 1.675073 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006042 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003441 | Grad Max: 0.003441 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002456 | Grad Max: 0.385146 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045217 | Grad Max: 2.153286 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000346 
| Grad Max: 0.013510 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023356 | Grad Max: 0.124863 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000462 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004957 | Grad Max: 0.010328 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000240 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001301 | Grad Max: 0.003315 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001106 | Grad Max: 0.002358 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024243 | Grad Max: 0.024243 [GRADIENT NORM TOTAL] 8.0991 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.881 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073905 0.49260947] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 707/1341 | B: 620/1428 | C: 487/1561 [LOSS Ex1] A: 0.64330 | B: 0.63371 | C: 0.63196 [LOGITS Ex2 A] Mean Abs: 2.072 | Max: 7.718 [LOSS Ex2] A: 0.12607 | B: 0.37390 | C: 0.25519 ** [JOINT LOSS] ** : 0.888041 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004887 | Grad Max: 0.158186 -> Layer: shared_layers.0.bias | Grad Mean: 0.493235 | Grad Max: 2.098393 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.006153 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000051 | Grad Max: 0.000051 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002989 | Grad Max: 0.501698 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055909 | Grad Max: 2.814673 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000421 | Grad Max: 0.015660 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028751 | Grad Max: 0.162421 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000505 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005925 | Grad Max: 0.011934 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001549 | Grad Max: 
0.003518 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001335 | Grad Max: 0.002616 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029788 | Grad Max: 0.029788 [GRADIENT NORM TOTAL] 10.0876 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.830 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105035 0.48949653] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 695/1353 | B: 613/1435 | C: 496/1552 [LOSS Ex1] A: 0.64034 | B: 0.63443 | C: 0.62559 [LOGITS Ex2 A] Mean Abs: 2.083 | Max: 5.864 [LOSS Ex2] A: 0.13229 | B: 0.35066 | C: 0.24295 ** [JOINT LOSS] ** : 0.875423 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003773 | Grad Max: 0.137755 -> Layer: shared_layers.0.bias | Grad Mean: 0.088209 | Grad Max: 0.576490 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.006405 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002432 | Grad Max: 0.002432 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000838 | Grad Max: 0.108461 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013554 | Grad Max: 0.584640 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003706 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002549 | Grad Max: 0.031951 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000166 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000290 | Grad Max: 0.001803 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000074 | Grad Max: 0.000403 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000302 | Grad Max: 0.000794 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000250 | Grad Max: 0.000250 [GRADIENT NORM TOTAL] 2.2569 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.856 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5041989 0.49580115] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 699/1349 | B: 616/1432 | C: 473/1575 [LOSS Ex1] A: 0.63933 | B: 0.63013 | C: 0.62865 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 6.991 [LOSS Ex2] A: 0.14932 | B: 0.33467 | C: 0.25047 ** [JOINT LOSS] ** : 0.877522 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007799 | Grad Max: 0.258655 -> Layer: shared_layers.0.bias | Grad Mean: 0.451859 | Grad Max: 1.987425 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002318 | Grad Max: 0.006324 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001687 | Grad Max: 0.001687 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003118 | Grad Max: 0.369707 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057066 | Grad Max: 2.104752 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000429 | Grad Max: 0.013432 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028482 | Grad Max: 0.150955 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000558 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006050 | Grad Max: 0.012223 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001565 | Grad Max: 0.003841 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001322 | Grad Max: 0.002803 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028933 | Grad Max: 0.028933 [GRADIENT NORM TOTAL] 9.6278 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.710 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50246924 0.4975308 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.067 [MASKS] A(Pass/Fail): 675/1373 | B: 577/1279 | C: 455/1593 [LOSS Ex1] A: 0.64745 | B: 0.63439 | C: 0.62772 [LOGITS Ex2 A] Mean Abs: 2.073 | Max: 6.222 [LOSS Ex2] A: 0.12009 | B: 0.33114 | C: 0.25040 ** [JOINT LOSS] ** : 0.870401 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.004943 | Grad Max: 0.124111 -> Layer: shared_layers.0.bias | Grad Mean: 0.317073 | Grad Max: 1.506788 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000525 | Grad Max: 0.000525 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.317223 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039538 | Grad Max: 1.786360 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.011063 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019730 | Grad Max: 0.115256 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000363 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004123 | Grad Max: 0.008950 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001077 | Grad Max: 0.002411 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000861 | Grad Max: 0.002406 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019524 | Grad Max: 0.019524 [GRADIENT NORM TOTAL] 6.9299 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.115 | Max: 0.633 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54301065 0.45698932] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.551 | Std: 0.065 [MASKS] A(Pass/Fail): 672/1376 | B: 621/1427 | C: 466/1582 [LOSS Ex1] A: 0.64747 | B: 0.63352 | C: 0.62658 [LOGITS Ex2 A] Mean Abs: 2.003 | Max: 5.979 [LOSS Ex2] A: 0.13408 | B: 0.35564 | C: 0.24058 ** [JOINT LOSS] ** : 0.879294 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004970 | Grad Max: 0.159923 -> Layer: shared_layers.0.bias | Grad Mean: 0.259203 | Grad Max: 1.244950 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005927 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004473 | Grad Max: 0.004473 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001693 | Grad Max: 0.309774 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.030595 | Grad Max: 1.744408 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.007562 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015296 | Grad Max: 0.074676 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000321 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003302 | Grad Max: 0.007543 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000207 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000852 | Grad Max: 0.002415 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000688 | Grad Max: 0.001909 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015251 | Grad Max: 0.015251 [GRADIENT NORM TOTAL] 5.3860 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.798 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7416169 0.25838315] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.068 [MASKS] A(Pass/Fail): 727/1321 | B: 616/1432 | C: 480/1568 [LOSS Ex1] A: 0.64173 | B: 0.63425 | C: 0.62545 [LOGITS Ex2 A] Mean Abs: 2.062 | Max: 6.161 [LOSS Ex2] A: 0.12674 | B: 0.35867 | C: 0.24695 ** [JOINT LOSS] ** : 0.877927 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003813 | Grad Max: 0.088999 -> Layer: shared_layers.0.bias | Grad Mean: 0.223648 | Grad Max: 1.194478 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.006242 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004686 | Grad Max: 0.004686 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001392 | Grad Max: 0.298440 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025679 | Grad Max: 1.669621 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.006691 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012463 | Grad Max: 0.071729 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000288 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002644 | Grad Max: 0.006347 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | 
Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.001841 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000610 | Grad Max: 0.001788 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013198 | Grad Max: 0.013198 [GRADIENT NORM TOTAL] 4.7492 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.885 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50080824 0.49919173] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.067 [MASKS] A(Pass/Fail): 709/1339 | B: 616/1432 | C: 494/1554 [LOSS Ex1] A: 0.64817 | B: 0.62994 | C: 0.62606 [LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.063 [LOSS Ex2] A: 0.12174 | B: 0.33989 | C: 0.27540 ** [JOINT LOSS] ** : 0.880400 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003556 | Grad Max: 0.107292 -> Layer: shared_layers.0.bias | Grad Mean: 0.256378 | Grad Max: 1.260667 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.005633 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000722 | Grad Max: 0.000722 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001819 | Grad Max: 0.308335 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032356 | Grad Max: 1.724711 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.008540 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015010 | Grad Max: 0.099568 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000314 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002920 | Grad Max: 0.007341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000752 | Grad Max: 0.001791 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000523 | Grad Max: 0.001538 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013181 | Grad Max: 0.013181 [GRADIENT NORM TOTAL] 5.8702 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.613 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6910062 0.3089938] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.068 [MASKS] A(Pass/Fail): 694/1354 | B: 577/1279 | C: 475/1573 [LOSS Ex1] A: 0.64417 | B: 0.63420 | C: 0.62812 [LOGITS Ex2 A] Mean Abs: 2.066 | Max: 6.347 [LOSS Ex2] A: 0.14015 | B: 0.32922 | C: 0.27711 ** [JOINT LOSS] ** : 0.884326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003317 | Grad Max: 0.092843 -> Layer: shared_layers.0.bias | Grad Mean: 0.144288 | Grad Max: 0.583152 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005954 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004854 | Grad Max: 0.004854 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001103 | Grad Max: 0.191674 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018812 | Grad Max: 1.032359 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.004733 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005835 | Grad Max: 0.044342 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000187 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001085 | Grad Max: 0.003963 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000273 | Grad Max: 0.001052 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000203 | Grad Max: 0.000813 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004374 | Grad Max: 0.004374 [GRADIENT NORM TOTAL] 3.2137 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.713 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61721504 0.382785 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.068 [MASKS] A(Pass/Fail): 586/1030 | B: 623/1425 | C: 472/1576 [LOSS Ex1] A: 0.64246 | B: 0.63335 | C: 0.62832 [LOGITS Ex2 A] Mean Abs: 2.084 | Max: 8.261 [LOSS Ex2] A: 0.12908 | B: 0.36206 | C: 0.24894 
** [JOINT LOSS] ** : 0.881402 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004346 | Grad Max: 0.099637 -> Layer: shared_layers.0.bias | Grad Mean: 0.264310 | Grad Max: 1.333746 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.006060 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003614 | Grad Max: 0.003614 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001789 | Grad Max: 0.363612 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032547 | Grad Max: 2.043206 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.009036 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014838 | Grad Max: 0.087668 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000365 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003171 | Grad Max: 0.006945 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000842 | Grad Max: 0.002040 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000754 | Grad Max: 0.001812 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016603 | Grad Max: 0.016603 [GRADIENT NORM TOTAL] 6.1324 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.887 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074149 0.4925851] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.069 [MASKS] A(Pass/Fail): 707/1341 | B: 616/1432 | C: 323/1053 [LOSS Ex1] A: 0.64304 | B: 0.63407 | C: 0.62456 [LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.523 [LOSS Ex2] A: 0.11661 | B: 0.35810 | C: 0.23919 ** [JOINT LOSS] ** : 0.871858 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002306 | Grad Max: 0.064889 -> Layer: shared_layers.0.bias | Grad Mean: 0.095471 | Grad Max: 0.494144 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.005559 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000356 | Grad Max: 0.000356 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.000785 | Grad Max: 0.276631 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013588 | Grad Max: 1.513432 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.004671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005460 | Grad Max: 0.044764 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001074 | Grad Max: 0.003736 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.000894 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000366 | Grad Max: 0.001366 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006139 | Grad Max: 0.006139 [GRADIENT NORM TOTAL] 2.8484 [EPOCH SUMMARY] Train Loss: 0.8804 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8631 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8638 -> New: 0.8631) ############################## EPOCH 120/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.836 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51043075 0.48956928] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 695/1353 | B: 617/1431 | C: 486/1562 [LOSS Ex1] A: 0.64007 | B: 0.62977 | C: 0.62807 [LOGITS Ex2 A] Mean Abs: 2.098 | Max: 5.670 [LOSS Ex2] A: 0.14046 | B: 0.32778 | C: 0.25866 ** [JOINT LOSS] ** : 0.874939 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008478 | Grad Max: 0.264695 -> Layer: shared_layers.0.bias | Grad Mean: 0.398476 | Grad Max: 1.763213 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002385 | Grad Max: 0.006270 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007877 | Grad Max: 0.007877 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002679 | Grad Max: 0.329524 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049034 | Grad 
Max: 1.816707 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012778 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024841 | Grad Max: 0.143888 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000577 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005373 | Grad Max: 0.011460 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000279 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001406 | Grad Max: 0.003260 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001226 | Grad Max: 0.002631 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026453 | Grad Max: 0.026453 [GRADIENT NORM TOTAL] 7.9447 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.862 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5042654 0.49573457] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 699/1349 | B: 579/1277 | C: 481/1567 [LOSS Ex1] A: 0.63906 | B: 0.63403 | C: 0.62651 [LOGITS Ex2 A] Mean Abs: 2.073 | Max: 8.092 [LOSS Ex2] A: 0.15024 | B: 0.33400 | C: 0.25283 ** [JOINT LOSS] ** : 0.878892 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006604 | Grad Max: 0.283503 -> Layer: shared_layers.0.bias | Grad Mean: 0.149921 | Grad Max: 0.474290 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.006598 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005939 | Grad Max: 0.005939 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001267 | Grad Max: 0.248300 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021279 | Grad Max: 1.401527 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000135 | Grad Max: 0.005334 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008107 | Grad Max: 0.042681 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000300 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001949 | Grad Max: 0.005346 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000119 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000493 | Grad Max: 0.001319 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000455 | Grad Max: 0.001347 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008507 | Grad Max: 0.008507 [GRADIENT NORM TOTAL] 4.0005 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.714 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5024284 0.49757156] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.552 | Std: 0.067 [MASKS] A(Pass/Fail): 675/1373 | B: 625/1423 | C: 487/1561 [LOSS Ex1] A: 0.64720 | B: 0.63318 | C: 0.62444 [LOGITS Ex2 A] Mean Abs: 2.017 | Max: 6.230 [LOSS Ex2] A: 0.11906 | B: 0.36403 | C: 0.24395 ** [JOINT LOSS] ** : 0.877288 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002449 | Grad Max: 0.122677 -> Layer: shared_layers.0.bias | Grad Mean: 0.349526 | Grad Max: 1.602035 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006514 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010257 | Grad Max: 0.010257 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.379170 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040987 | Grad Max: 2.140888 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.011813 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018749 | Grad Max: 0.108510 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003853 | Grad Max: 0.008358 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000172 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001007 | Grad Max: 0.002446 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000779 | Grad Max: 0.002000 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018274 | Grad Max: 0.018274 [GRADIENT NORM TOTAL] 7.8395 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.115 | Max: 0.638 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54305553 0.45694444] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.065 [MASKS] A(Pass/Fail): 672/1376 | B: 616/1432 | C: 528/1520 [LOSS Ex1] A: 0.64723 | B: 0.63390 | C: 0.62230 [LOGITS Ex2 A] Mean Abs: 2.019 | Max: 6.153 [LOSS Ex2] A: 0.12692 | B: 0.35637 | C: 0.24861 ** [JOINT LOSS] ** : 0.878444 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006412 | Grad Max: 0.172999 -> Layer: shared_layers.0.bias | Grad Mean: 0.428342 | Grad Max: 1.639748 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002153 | Grad Max: 0.006931 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010590 | Grad Max: 0.010590 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002705 | Grad Max: 0.385128 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049725 | Grad Max: 2.174340 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000370 | Grad Max: 0.015013 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024852 | Grad Max: 0.135707 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000471 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005261 | Grad Max: 0.011194 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000246 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001350 | Grad Max: 0.003198 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001109 | Grad Max: 0.002484 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024715 | Grad Max: 0.024715 [GRADIENT NORM TOTAL] 8.4212 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.803 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74309295 0.256907 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.069 [MASKS] A(Pass/Fail): 727/1321 | B: 618/1430 | C: 447/1601 [LOSS Ex1] A: 0.64147 | B: 0.62960 | C: 0.63280 [LOGITS Ex2 A] Mean Abs: 2.080 | Max: 5.952 [LOSS Ex2] A: 0.13019 | B: 0.33283 | C: 0.26167 ** [JOINT LOSS] ** : 
0.876187 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002775 | Grad Max: 0.116137 -> Layer: shared_layers.0.bias | Grad Mean: 0.234785 | Grad Max: 1.170393 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002334 | Grad Max: 0.006250 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011066 | Grad Max: 0.011066 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001597 | Grad Max: 0.347664 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029161 | Grad Max: 1.944103 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000203 | Grad Max: 0.009121 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013646 | Grad Max: 0.091329 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000299 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002665 | Grad Max: 0.006309 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000714 | Grad Max: 0.001769 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000537 | Grad Max: 0.001706 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013262 | Grad Max: 0.013262 [GRADIENT NORM TOTAL] 5.5168 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.891 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50075966 0.4992403 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.068 [MASKS] A(Pass/Fail): 709/1339 | B: 579/1277 | C: 489/1559 [LOSS Ex1] A: 0.64793 | B: 0.63387 | C: 0.63203 [LOGITS Ex2 A] Mean Abs: 2.084 | Max: 6.093 [LOSS Ex2] A: 0.12078 | B: 0.33537 | C: 0.26488 ** [JOINT LOSS] ** : 0.878285 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003246 | Grad Max: 0.135111 -> Layer: shared_layers.0.bias | Grad Mean: 0.117706 | Grad Max: 0.648996 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005491 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003245 | Grad Max: 0.003245 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001034 | 
Grad Max: 0.182968 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017422 | Grad Max: 0.991508 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.005017 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004871 | Grad Max: 0.046235 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000189 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000744 | Grad Max: 0.002847 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000201 | Grad Max: 0.000845 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000328 | Grad Max: 0.000998 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003931 | Grad Max: 0.003931 [GRADIENT NORM TOTAL] 3.1265 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.617 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6919519 0.3080481] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 696/1352 | B: 626/1422 | C: 473/1575 [LOSS Ex1] A: 0.64391 | B: 0.63302 | C: 0.62667 [LOGITS Ex2 A] Mean Abs: 2.054 | Max: 6.194 [LOSS Ex2] A: 0.14269 | B: 0.35484 | C: 0.24807 ** [JOINT LOSS] ** : 0.883063 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004645 | Grad Max: 0.148820 -> Layer: shared_layers.0.bias | Grad Mean: 0.127531 | Grad Max: 0.470890 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006440 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001294 | Grad Max: 0.001294 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001007 | Grad Max: 0.203931 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017177 | Grad Max: 0.971015 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.003457 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006157 | Grad Max: 0.030798 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001465 | Grad Max: 
0.004648 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000384 | Grad Max: 0.001164 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001231 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007268 | Grad Max: 0.007268 [GRADIENT NORM TOTAL] 3.2332 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.717 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61772144 0.3822785 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.069 [MASKS] A(Pass/Fail): 586/1030 | B: 616/1432 | C: 478/1570 [LOSS Ex1] A: 0.64219 | B: 0.63373 | C: 0.62673 [LOGITS Ex2 A] Mean Abs: 2.107 | Max: 6.670 [LOSS Ex2] A: 0.13135 | B: 0.35213 | C: 0.24580 ** [JOINT LOSS] ** : 0.877308 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.043146 -> Layer: shared_layers.0.bias | Grad Mean: 0.146076 | Grad Max: 0.575778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.006582 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008559 | Grad Max: 0.008559 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000775 | Grad Max: 0.471884 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013578 | Grad Max: 2.632607 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002975 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002039 | Grad Max: 0.022539 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000122 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000336 | Grad Max: 0.001992 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000054 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000092 | Grad Max: 0.000501 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.000956 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002260 | Grad Max: 0.002260 [GRADIENT NORM TOTAL] 4.3978 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.893 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50743437 0.49256563] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.069 [MASKS] A(Pass/Fail): 709/1339 | B: 618/1430 | C: 464/1584 [LOSS Ex1] A: 0.64278 | B: 0.62942 | C: 0.63090 [LOGITS Ex2 A] Mean Abs: 2.088 | Max: 8.077 [LOSS Ex2] A: 0.13150 | B: 0.33166 | C: 0.26512 ** [JOINT LOSS] ** : 0.877127 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004503 | Grad Max: 0.155450 -> Layer: shared_layers.0.bias | Grad Mean: 0.256991 | Grad Max: 0.843948 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.005952 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001341 | Grad Max: 0.001341 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001764 | Grad Max: 0.217039 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031723 | Grad Max: 1.214077 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.008917 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015539 | Grad Max: 0.087151 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000450 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003397 | Grad Max: 0.007455 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000888 | Grad Max: 0.001967 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000674 | Grad Max: 0.001873 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015806 | Grad Max: 0.015806 [GRADIENT NORM TOTAL] 5.1664 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.841 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104343 0.48956567] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 697/1351 | B: 579/1277 | C: 474/1574 [LOSS Ex1] A: 0.63980 | B: 0.63369 | C: 0.62711 [LOGITS Ex2 A] Mean Abs: 
2.074 | Max: 6.454 [LOSS Ex2] A: 0.12123 | B: 0.34160 | C: 0.25242 ** [JOINT LOSS] ** : 0.871946 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004138 | Grad Max: 0.170396 -> Layer: shared_layers.0.bias | Grad Mean: 0.191491 | Grad Max: 0.877641 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002330 | Grad Max: 0.006655 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007025 | Grad Max: 0.007025 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001335 | Grad Max: 0.194814 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022613 | Grad Max: 1.076523 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006968 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007847 | Grad Max: 0.072500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000227 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001379 | Grad Max: 0.004616 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000315 | Grad Max: 0.001171 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.000946 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004551 | Grad Max: 0.004551 [GRADIENT NORM TOTAL] 3.8714 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.868 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043274 0.49567255] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 700/1348 | B: 626/1422 | C: 498/1550 [LOSS Ex1] A: 0.63879 | B: 0.63283 | C: 0.62547 [LOGITS Ex2 A] Mean Abs: 2.043 | Max: 7.116 [LOSS Ex2] A: 0.14195 | B: 0.36034 | C: 0.24892 ** [JOINT LOSS] ** : 0.882763 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004016 | Grad Max: 0.158473 -> Layer: shared_layers.0.bias | Grad Mean: 0.127116 | Grad Max: 0.678799 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002283 | Grad Max: 0.006414 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001121 | Grad Max: 0.001121 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001085 | Grad Max: 0.244348 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017465 | Grad Max: 1.327662 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000096 | Grad Max: 0.005590 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005093 | Grad Max: 0.048640 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000139 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000836 | Grad Max: 0.003095 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000060 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000239 | Grad Max: 0.000757 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000278 | Grad Max: 0.000962 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005312 | Grad Max: 0.005312 [GRADIENT NORM TOTAL] 3.2307 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.719 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50237817 0.49762183] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.068 [MASKS] A(Pass/Fail): 676/1372 | B: 616/1432 | C: 459/1589 [LOSS Ex1] A: 0.64696 | B: 0.63353 | C: 0.63174 [LOGITS Ex2 A] Mean Abs: 2.041 | Max: 6.198 [LOSS Ex2] A: 0.12250 | B: 0.36110 | C: 0.27295 ** [JOINT LOSS] ** : 0.889593 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002716 | Grad Max: 0.079838 -> Layer: shared_layers.0.bias | Grad Mean: 0.191605 | Grad Max: 1.069549 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005717 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006342 | Grad Max: 0.006342 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001285 | Grad Max: 0.307196 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023280 | Grad Max: 1.713290 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004470 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008062 | Grad Max: 0.047424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | 
Grad Max: 0.000251 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001632 | Grad Max: 0.004640 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000392 | Grad Max: 0.001277 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.001105 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005329 | Grad Max: 0.005329 [GRADIENT NORM TOTAL] 4.8120 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.642 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.543193 0.45680702] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 673/1375 | B: 618/1430 | C: 456/1592 [LOSS Ex1] A: 0.64700 | B: 0.62922 | C: 0.62734 [LOGITS Ex2 A] Mean Abs: 2.035 | Max: 5.564 [LOSS Ex2] A: 0.13415 | B: 0.34160 | C: 0.24084 ** [JOINT LOSS] ** : 0.873384 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003194 | Grad Max: 0.105914 -> Layer: shared_layers.0.bias | Grad Mean: 0.193911 | Grad Max: 1.098228 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.005599 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000426 | Grad Max: 0.000426 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001272 | Grad Max: 0.288037 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022418 | Grad Max: 1.587873 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.007176 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007774 | Grad Max: 0.063793 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001492 | Grad Max: 0.004299 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000402 | Grad Max: 0.001214 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001421 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008062 | Grad Max: 
0.008062 [GRADIENT NORM TOTAL] 4.6026 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.808 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7445778 0.25542217] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.069 [MASKS] A(Pass/Fail): 729/1319 | B: 579/1277 | C: 344/1032 [LOSS Ex1] A: 0.64121 | B: 0.63348 | C: 0.62427 [LOGITS Ex2 A] Mean Abs: 2.049 | Max: 5.983 [LOSS Ex2] A: 0.12161 | B: 0.33995 | C: 0.24560 ** [JOINT LOSS] ** : 0.868710 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.182794 -> Layer: shared_layers.0.bias | Grad Mean: 0.222274 | Grad Max: 1.159581 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006732 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000747 | Grad Max: 0.000747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001593 | Grad Max: 0.238079 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028571 | Grad Max: 1.318134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.008380 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014780 | Grad Max: 0.071591 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000383 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003213 | Grad Max: 0.008563 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000216 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000605 | Grad Max: 0.001910 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013454 | Grad Max: 0.013454 [GRADIENT NORM TOTAL] 4.6020 [EPOCH SUMMARY] Train Loss: 0.8777 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8622 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8631 -> New: 0.8622) ############################## EPOCH 121/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.897 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008039 0.4991961] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 711/1337 | B: 627/1421 | C: 456/1592 [LOSS Ex1] A: 0.64769 | B: 0.63263 | C: 0.62842 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 5.938 [LOSS Ex2] A: 0.12267 | B: 0.35958 | C: 0.25337 ** [JOINT LOSS] ** : 0.881454 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003515 | Grad Max: 0.152677 -> Layer: shared_layers.0.bias | Grad Mean: 0.077133 | Grad Max: 0.349109 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.005348 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001857 | Grad Max: 0.001857 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000820 | Grad Max: 0.172910 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012959 | Grad Max: 0.958847 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.004450 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002939 | Grad Max: 0.026045 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000348 | Grad Max: 0.002173 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000044 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000085 | Grad Max: 0.000410 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000300 | Grad Max: 0.000827 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000042 | Grad Max: 0.000042 [GRADIENT NORM TOTAL] 2.3633 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.622 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.692925 0.30707502] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 697/1351 | B: 618/1430 | C: 508/1540 [LOSS Ex1] A: 0.64365 | B: 0.63333 | C: 0.62534 [LOGITS Ex2 A] Mean Abs: 2.070 | Max: 6.406 [LOSS Ex2] A: 0.13778 | B: 0.34885 | C: 0.25382 ** [JOINT LOSS] ** : 0.880926 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003686 | Grad Max: 0.114242 -> Layer: shared_layers.0.bias | Grad Mean: 0.091699 | Grad Max: 0.572788 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006093 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000487 | Grad Max: 0.000487 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000766 | Grad Max: 0.181400 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012078 | Grad Max: 0.993890 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.003065 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002044 | Grad Max: 0.024008 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000147 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000289 | Grad Max: 0.002017 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000079 | Grad Max: 0.000516 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000284 | Grad Max: 0.000817 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000301 | Grad Max: 0.000301 [GRADIENT NORM TOTAL] 2.4761 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.723 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61823475 0.38176525] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.069 [MASKS] A(Pass/Fail): 586/1030 | B: 618/1430 | C: 495/1553 [LOSS Ex1] A: 0.64194 | B: 0.62901 | C: 0.63010 [LOGITS Ex2 A] Mean Abs: 2.125 | Max: 6.780 [LOSS Ex2] A: 0.12402 | B: 0.33173 | C: 0.28387 ** [JOINT LOSS] ** : 0.880225 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.048039 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.127705 | Grad Max: 0.633375 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006693 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010404 | Grad Max: 0.010404 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000710 | Grad Max: 0.493295 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012438 | Grad Max: 2.752285 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003786 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002171 | Grad Max: 0.025876 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000132 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000371 | Grad Max: 0.002291 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000103 | Grad Max: 0.000497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000248 | Grad Max: 0.000845 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003183 | Grad Max: 0.003183 [GRADIENT NORM TOTAL] 4.1310 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.899 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50739926 0.49260068] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 709/1339 | B: 580/1276 | C: 510/1538 [LOSS Ex1] A: 0.64252 | B: 0.63328 | C: 0.62595 [LOGITS Ex2 A] Mean Abs: 2.097 | Max: 8.081 [LOSS Ex2] A: 0.12721 | B: 0.33251 | C: 0.24867 ** [JOINT LOSS] ** : 0.870046 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007179 | Grad Max: 0.291873 -> Layer: shared_layers.0.bias | Grad Mean: 0.135588 | Grad Max: 0.567032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005989 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003770 | Grad Max: 0.003770 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001194 | Grad Max: 0.281633 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017981 | Grad Max: 1.505011 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000086 | Grad Max: 0.004241 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003687 | Grad Max: 0.031781 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001029 | Grad Max: 0.003495 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000273 | Grad Max: 0.000908 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001333 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004918 | Grad Max: 0.004918 [GRADIENT NORM TOTAL] 3.4334 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.847 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51037085 0.4896292 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.069 [MASKS] A(Pass/Fail): 697/1351 | B: 627/1421 | C: 482/1566 [LOSS Ex1] A: 0.63953 | B: 0.63242 | C: 0.62928 [LOGITS Ex2 A] Mean Abs: 2.098 | Max: 6.822 [LOSS Ex2] A: 0.12521 | B: 0.35849 | C: 0.23218 ** [JOINT LOSS] ** : 0.872370 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004891 | Grad Max: 0.178203 -> Layer: shared_layers.0.bias | Grad Mean: 0.105337 | Grad Max: 0.534317 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006067 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002884 | Grad Max: 0.002884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000957 | Grad Max: 0.370578 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015092 | Grad Max: 2.007571 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.003234 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002971 | Grad Max: 0.021518 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000164 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000771 | Grad Max: 0.003052 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000216 | Grad 
Max: 0.000846 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.001172 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004527 | Grad Max: 0.004527 [GRADIENT NORM TOTAL] 3.3896 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.874 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50440955 0.49559048] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 701/1347 | B: 619/1429 | C: 493/1555 [LOSS Ex1] A: 0.63851 | B: 0.63312 | C: 0.62783 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 8.782 [LOSS Ex2] A: 0.13734 | B: 0.35894 | C: 0.25800 ** [JOINT LOSS] ** : 0.884582 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003604 | Grad Max: 0.142161 -> Layer: shared_layers.0.bias | Grad Mean: 0.172221 | Grad Max: 1.045993 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006581 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000460 | Grad Max: 0.000460 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001326 | Grad Max: 0.383385 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022468 | Grad Max: 2.089798 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.005961 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007567 | Grad Max: 0.054869 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000198 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001300 | Grad Max: 0.003832 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000352 | Grad Max: 0.001050 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.001053 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008103 | Grad Max: 0.008103 [GRADIENT NORM TOTAL] 4.4648 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.724 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5023249 0.49767512] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.068 [MASKS] A(Pass/Fail): 676/1372 | B: 618/1430 | C: 459/1589 [LOSS Ex1] A: 0.64670 | B: 0.62880 | C: 0.63139 [LOGITS Ex2 A] Mean Abs: 2.047 | Max: 6.026 [LOSS Ex2] A: 0.11891 | B: 0.33641 | C: 0.26520 ** [JOINT LOSS] ** : 0.875803 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002457 | Grad Max: 0.086046 -> Layer: shared_layers.0.bias | Grad Mean: 0.087280 | Grad Max: 0.441032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005433 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002732 | Grad Max: 0.002732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000711 | Grad Max: 0.176802 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012112 | Grad Max: 0.969255 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.003088 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001919 | Grad Max: 0.022003 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000167 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000342 | Grad Max: 0.002196 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000051 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000098 | Grad Max: 0.000553 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.000889 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001993 | Grad Max: 0.001993 [GRADIENT NORM TOTAL] 2.5814 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.116 | Max: 0.647 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5432552 0.45674482] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 673/1375 | B: 580/1276 | C: 493/1555 [LOSS Ex1] A: 0.64675 | B: 0.63306 | C: 0.62621 [LOGITS Ex2 A] Mean Abs: 2.040 | Max: 6.443 [LOSS Ex2] A: 0.13101 | B: 0.33014 | C: 0.24129 ** [JOINT LOSS] ** : 0.869486 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.005360 | Grad Max: 0.180258 -> Layer: shared_layers.0.bias | Grad Mean: 0.178665 | Grad Max: 0.949077 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005986 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002751 | Grad Max: 0.002751 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001113 | Grad Max: 0.310187 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018160 | Grad Max: 1.724575 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.005772 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004070 | Grad Max: 0.055985 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000153 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000550 | Grad Max: 0.002608 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000152 | Grad Max: 0.000639 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000354 | Grad Max: 0.000882 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003076 | Grad Max: 0.003076 [GRADIENT NORM TOTAL] 4.0079 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.814 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74609053 0.2539095 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.069 [MASKS] A(Pass/Fail): 729/1319 | B: 628/1420 | C: 535/1513 [LOSS Ex1] A: 0.64094 | B: 0.63220 | C: 0.62126 [LOGITS Ex2 A] Mean Abs: 2.101 | Max: 6.481 [LOSS Ex2] A: 0.12183 | B: 0.35276 | C: 0.25216 ** [JOINT LOSS] ** : 0.873712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003257 | Grad Max: 0.109684 -> Layer: shared_layers.0.bias | Grad Mean: 0.111002 | Grad Max: 0.641648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002309 | Grad Max: 0.006114 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003537 | Grad Max: 0.003537 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000896 | Grad Max: 0.238026 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.014975 | Grad Max: 1.334217 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003361 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002381 | Grad Max: 0.020947 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000123 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000320 | Grad Max: 0.002122 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000499 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000253 | Grad Max: 0.000853 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001378 | Grad Max: 0.001378 [GRADIENT NORM TOTAL] 3.0767 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.903 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008179 0.4991821] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.068 [MASKS] A(Pass/Fail): 711/1337 | B: 620/1428 | C: 487/1561 [LOSS Ex1] A: 0.64743 | B: 0.63289 | C: 0.62944 [LOGITS Ex2 A] Mean Abs: 2.088 | Max: 6.071 [LOSS Ex2] A: 0.10855 | B: 0.35026 | C: 0.26059 ** [JOINT LOSS] ** : 0.876389 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004235 | Grad Max: 0.175327 -> Layer: shared_layers.0.bias | Grad Mean: 0.105022 | Grad Max: 0.510246 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005939 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008261 | Grad Max: 0.008261 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000955 | Grad Max: 0.103472 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016060 | Grad Max: 0.564838 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000104 | Grad Max: 0.004084 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006374 | Grad Max: 0.040665 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000236 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001516 | Grad Max: 0.004196 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000135 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000401 | Grad Max: 0.001489 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001678 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008641 | Grad Max: 0.008641 [GRADIENT NORM TOTAL] 2.4671 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.627 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6939144 0.3060856] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 698/1350 | B: 619/1429 | C: 508/1540 [LOSS Ex1] A: 0.64337 | B: 0.62857 | C: 0.62636 [LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.636 [LOSS Ex2] A: 0.13688 | B: 0.33619 | C: 0.23864 ** [JOINT LOSS] ** : 0.869999 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003850 | Grad Max: 0.104151 -> Layer: shared_layers.0.bias | Grad Mean: 0.253774 | Grad Max: 1.275019 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.006679 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003326 | Grad Max: 0.003326 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001810 | Grad Max: 0.294395 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032869 | Grad Max: 1.648871 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000231 | Grad Max: 0.008029 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015892 | Grad Max: 0.084083 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000346 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003337 | Grad Max: 0.007754 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000863 | Grad Max: 0.001980 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.001829 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015998 | Grad Max: 0.015998 [GRADIENT NORM TOTAL] 5.8204 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 
32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.729 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6188088 0.38119116] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 586/1030 | B: 581/1275 | C: 495/1553 [LOSS Ex1] A: 0.64164 | B: 0.63283 | C: 0.62599 [LOGITS Ex2 A] Mean Abs: 2.139 | Max: 7.786 [LOSS Ex2] A: 0.13577 | B: 0.33007 | C: 0.26752 ** [JOINT LOSS] ** : 0.877941 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006751 | Grad Max: 0.233167 -> Layer: shared_layers.0.bias | Grad Mean: 0.226361 | Grad Max: 1.062162 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006244 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007070 | Grad Max: 0.007070 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.451878 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021853 | Grad Max: 2.532056 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.004218 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002943 | Grad Max: 0.038898 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000163 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.002095 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000054 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000089 | Grad Max: 0.000430 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000205 | Grad Max: 0.000666 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001720 | Grad Max: 0.001720 [GRADIENT NORM TOTAL] 5.5118 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.905 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073731 0.4926269] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 710/1338 | B: 628/1420 | C: 505/1543 [LOSS Ex1] A: 0.64222 | B: 0.63198 | C: 0.62436 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 6.986 
[LOSS Ex2] A: 0.12624 | B: 0.36330 | C: 0.24343 ** [JOINT LOSS] ** : 0.877180 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007448 | Grad Max: 0.350605 -> Layer: shared_layers.0.bias | Grad Mean: 0.263550 | Grad Max: 1.436710 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006376 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001408 | Grad Max: 0.001408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001701 | Grad Max: 0.464097 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026800 | Grad Max: 2.592620 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004808 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004256 | Grad Max: 0.040722 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000205 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000339 | Grad Max: 0.002594 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000054 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000747 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000249 | Grad Max: 0.000688 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000720 | Grad Max: 0.000720 [GRADIENT NORM TOTAL] 6.0786 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.854 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51040477 0.4895953 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 700/1348 | B: 621/1427 | C: 338/1038 [LOSS Ex1] A: 0.63922 | B: 0.63268 | C: 0.62584 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 5.953 [LOSS Ex2] A: 0.12920 | B: 0.35551 | C: 0.24918 ** [JOINT LOSS] ** : 0.877207 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006247 | Grad Max: 0.208648 -> Layer: shared_layers.0.bias | Grad Mean: 0.143624 | Grad Max: 0.651515 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006306 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003106 | Grad 
Max: 0.003106 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001097 | Grad Max: 0.205648 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017566 | Grad Max: 1.110768 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.004900 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002732 | Grad Max: 0.031145 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000202 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000560 | Grad Max: 0.002600 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000137 | Grad Max: 0.000677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000222 | Grad Max: 0.000822 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001329 | Grad Max: 0.001329 [GRADIENT NORM TOTAL] 3.5470 [EPOCH SUMMARY] Train Loss: 0.8762 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8557 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8622 -> New: 0.8557) ############################## EPOCH 122/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.880 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044451 0.49555492] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.069 [MASKS] A(Pass/Fail): 707/1341 | B: 620/1428 | C: 476/1572 [LOSS Ex1] A: 0.63821 | B: 0.62835 | C: 0.62662 [LOGITS Ex2 A] Mean Abs: 2.093 | Max: 8.025 [LOSS Ex2] A: 0.14657 | B: 0.33962 | C: 0.25757 ** [JOINT LOSS] ** : 0.878973 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003577 | Grad Max: 0.155775 -> Layer: shared_layers.0.bias | Grad Mean: 0.091846 | Grad Max: 0.542717 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.006579 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005138 | Grad Max: 0.005138 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000815 | Grad Max: 0.168174 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.012864 | Grad Max: 0.936435 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003258 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002214 | Grad Max: 0.018235 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000141 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000452 | Grad Max: 0.002587 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000111 | Grad Max: 0.000712 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000201 | Grad Max: 0.000712 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001175 | Grad Max: 0.001175 [GRADIENT NORM TOTAL] 2.7660 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.730 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.502211 0.49778903] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.068 [MASKS] A(Pass/Fail): 677/1371 | B: 582/1274 | C: 504/1544 [LOSS Ex1] A: 0.64642 | B: 0.63260 | C: 0.62708 [LOGITS Ex2 A] Mean Abs: 2.054 | Max: 6.278 [LOSS Ex2] A: 0.12621 | B: 0.33034 | C: 0.23966 ** [JOINT LOSS] ** : 0.867443 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004948 | Grad Max: 0.152426 -> Layer: shared_layers.0.bias | Grad Mean: 0.190900 | Grad Max: 0.736228 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.006209 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009360 | Grad Max: 0.009360 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001386 | Grad Max: 0.159915 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024926 | Grad Max: 0.901235 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000220 | Grad Max: 0.008336 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014679 | Grad Max: 0.086289 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000349 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003057 | Grad Max: 0.006997 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000770 | Grad Max: 0.001890 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000678 | Grad Max: 0.001813 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014142 | Grad Max: 0.014142 [GRADIENT NORM TOTAL] 3.7501 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.652 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54346454 0.45653546] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.552 | Std: 0.066 [MASKS] A(Pass/Fail): 675/1373 | B: 628/1420 | C: 529/1519 [LOSS Ex1] A: 0.64648 | B: 0.63175 | C: 0.62398 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.143 [LOSS Ex2] A: 0.13277 | B: 0.35972 | C: 0.26620 ** [JOINT LOSS] ** : 0.886970 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006414 | Grad Max: 0.234569 -> Layer: shared_layers.0.bias | Grad Mean: 0.471798 | Grad Max: 2.751504 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.006000 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005606 | Grad Max: 0.005606 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003094 | Grad Max: 0.519786 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054949 | Grad Max: 2.914001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.014151 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023932 | Grad Max: 0.134738 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000472 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004487 | Grad Max: 0.009997 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001117 | Grad Max: 0.002778 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000769 | Grad Max: 0.001631 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019069 | Grad Max: 0.019069 [GRADIENT NORM TOTAL] 10.4999 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.821 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74784565 0.25215435] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.070 [MASKS] A(Pass/Fail): 734/1314 | B: 622/1426 | C: 481/1567 [LOSS Ex1] A: 0.64063 | B: 0.63244 | C: 0.62822 [LOGITS Ex2 A] Mean Abs: 2.144 | Max: 6.435 [LOSS Ex2] A: 0.12343 | B: 0.35946 | C: 0.26124 ** [JOINT LOSS] ** : 0.881806 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006980 | Grad Max: 0.287712 -> Layer: shared_layers.0.bias | Grad Mean: 0.652681 | Grad Max: 3.664814 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.005870 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002004 | Grad Max: 0.002004 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004197 | Grad Max: 0.758110 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076140 | Grad Max: 4.233461 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000528 | Grad Max: 0.021259 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036708 | Grad Max: 0.217007 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000666 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007141 | Grad Max: 0.015364 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000273 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001818 | Grad Max: 0.004113 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001276 | Grad Max: 0.002622 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032126 | Grad Max: 0.032126 [GRADIENT NORM TOTAL] 14.7990 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.910 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50087845 0.49912155] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 714/1334 | B: 620/1428 | C: 526/1522 [LOSS Ex1] A: 0.64716 | B: 0.62812 | C: 0.62432 [LOGITS Ex2 A] Mean Abs: 2.128 | Max: 
6.510 [LOSS Ex2] A: 0.12999 | B: 0.33592 | C: 0.25602 ** [JOINT LOSS] ** : 0.873841 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008748 | Grad Max: 0.345622 -> Layer: shared_layers.0.bias | Grad Mean: 0.306400 | Grad Max: 1.591846 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.005844 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002576 | Grad Max: 0.002576 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002367 | Grad Max: 0.339146 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039468 | Grad Max: 1.822804 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000234 | Grad Max: 0.010036 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013881 | Grad Max: 0.106129 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000288 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002247 | Grad Max: 0.005549 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001540 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001138 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010209 | Grad Max: 0.010209 [GRADIENT NORM TOTAL] 6.8339 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.632 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.69510126 0.30489874] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.070 [MASKS] A(Pass/Fail): 699/1349 | B: 582/1274 | C: 465/1583 [LOSS Ex1] A: 0.64308 | B: 0.63239 | C: 0.63131 [LOGITS Ex2 A] Mean Abs: 2.068 | Max: 6.274 [LOSS Ex2] A: 0.15943 | B: 0.34324 | C: 0.28592 ** [JOINT LOSS] ** : 0.898462 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012530 | Grad Max: 0.406141 -> Layer: shared_layers.0.bias | Grad Mean: 0.409739 | Grad Max: 1.587740 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.006072 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002394 | 
Grad Max: 0.002394 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002847 | Grad Max: 0.372091 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050587 | Grad Max: 1.769511 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.009869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024464 | Grad Max: 0.118294 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000519 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005391 | Grad Max: 0.010918 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000264 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001380 | Grad Max: 0.003400 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001216 | Grad Max: 0.002335 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025481 | Grad Max: 0.025481 [GRADIENT NORM TOTAL] 8.1052 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.734 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6195589 0.3804411] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.070 [MASKS] A(Pass/Fail): 587/1029 | B: 628/1420 | C: 499/1549 [LOSS Ex1] A: 0.64137 | B: 0.63156 | C: 0.62751 [LOGITS Ex2 A] Mean Abs: 2.100 | Max: 5.786 [LOSS Ex2] A: 0.13632 | B: 0.36087 | C: 0.23489 ** [JOINT LOSS] ** : 0.877505 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009231 | Grad Max: 0.266429 -> Layer: shared_layers.0.bias | Grad Mean: 0.569539 | Grad Max: 2.728072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.005943 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007341 | Grad Max: 0.007341 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003701 | Grad Max: 0.757311 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067626 | Grad Max: 4.220363 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.016199 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033013 | Grad Max: 0.162735 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 
0.000695 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007010 | Grad Max: 0.015206 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000309 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001792 | Grad Max: 0.004012 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001463 | Grad Max: 0.002907 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032551 | Grad Max: 0.032551 [GRADIENT NORM TOTAL] 12.5894 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.912 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50735277 0.49264726] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.070 [MASKS] A(Pass/Fail): 710/1338 | B: 622/1426 | C: 486/1562 [LOSS Ex1] A: 0.64195 | B: 0.63226 | C: 0.62844 [LOGITS Ex2 A] Mean Abs: 2.102 | Max: 6.230 [LOSS Ex2] A: 0.12411 | B: 0.35712 | C: 0.25642 ** [JOINT LOSS] ** : 0.880101 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003183 | Grad Max: 0.142045 -> Layer: shared_layers.0.bias | Grad Mean: 0.194765 | Grad Max: 1.193328 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002833 | Grad Max: 0.002833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001049 | Grad Max: 0.480596 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018508 | Grad Max: 2.684771 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003823 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004118 | Grad Max: 0.035289 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000162 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000875 | Grad Max: 0.003126 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.000765 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000286 | Grad Max: 0.000992 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005127 | Grad Max: 0.005127 
[GRADIENT NORM TOTAL] 5.2780 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.860 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104603 0.4895397] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 701/1347 | B: 620/1428 | C: 545/1503 [LOSS Ex1] A: 0.63894 | B: 0.62794 | C: 0.62328 [LOGITS Ex2 A] Mean Abs: 2.118 | Max: 5.257 [LOSS Ex2] A: 0.13577 | B: 0.33392 | C: 0.25485 ** [JOINT LOSS] ** : 0.871568 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009009 | Grad Max: 0.260619 -> Layer: shared_layers.0.bias | Grad Mean: 0.442500 | Grad Max: 1.785108 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002365 | Grad Max: 0.006204 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003951 | Grad Max: 0.003951 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002966 | Grad Max: 0.359177 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054570 | Grad Max: 2.025843 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.012322 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028252 | Grad Max: 0.140751 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000509 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005953 | Grad Max: 0.012398 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000260 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001523 | Grad Max: 0.003578 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001252 | Grad Max: 0.002604 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027949 | Grad Max: 0.027949 [GRADIENT NORM TOTAL] 8.9807 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.886 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044989 0.49550113] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 707/1341 | B: 583/1273 | C: 517/1531 
[LOSS Ex1] A: 0.63794 | B: 0.63223 | C: 0.62633 [LOGITS Ex2 A] Mean Abs: 2.115 | Max: 7.086 [LOSS Ex2] A: 0.16548 | B: 0.32726 | C: 0.25586 ** [JOINT LOSS] ** : 0.881702 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012144 | Grad Max: 0.451277 -> Layer: shared_layers.0.bias | Grad Mean: 0.427327 | Grad Max: 1.566839 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.006461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004446 | Grad Max: 0.004446 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003157 | Grad Max: 0.421112 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056340 | Grad Max: 2.318303 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000434 | Grad Max: 0.012662 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028823 | Grad Max: 0.145800 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000631 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006375 | Grad Max: 0.012688 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001657 | Grad Max: 0.003828 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001400 | Grad Max: 0.002724 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030502 | Grad Max: 0.030502 [GRADIENT NORM TOTAL] 8.8494 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.734 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021688 0.49783123] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.553 | Std: 0.069 [MASKS] A(Pass/Fail): 677/1371 | B: 628/1420 | C: 510/1538 [LOSS Ex1] A: 0.64618 | B: 0.63140 | C: 0.62644 [LOGITS Ex2 A] Mean Abs: 2.044 | Max: 5.863 [LOSS Ex2] A: 0.12408 | B: 0.36399 | C: 0.24446 ** [JOINT LOSS] ** : 0.878850 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005145 | Grad Max: 0.147396 -> Layer: shared_layers.0.bias | Grad Mean: 0.316003 | Grad Max: 1.451213 -> Layer: exit1_layers.0.weight | Grad Mean: 
0.002144 | Grad Max: 0.006787 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011833 | Grad Max: 0.011833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.354299 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036451 | Grad Max: 1.817282 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.010677 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016447 | Grad Max: 0.101143 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000305 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003111 | Grad Max: 0.007228 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000152 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000781 | Grad Max: 0.002156 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000576 | Grad Max: 0.001474 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014082 | Grad Max: 0.014082 [GRADIENT NORM TOTAL] 6.7071 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.117 | Max: 0.656 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54345006 0.45654994] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.067 [MASKS] A(Pass/Fail): 676/1372 | B: 622/1426 | C: 506/1542 [LOSS Ex1] A: 0.64627 | B: 0.63210 | C: 0.62381 [LOGITS Ex2 A] Mean Abs: 2.015 | Max: 5.694 [LOSS Ex2] A: 0.12089 | B: 0.35878 | C: 0.24198 ** [JOINT LOSS] ** : 0.874610 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003822 | Grad Max: 0.116021 -> Layer: shared_layers.0.bias | Grad Mean: 0.355347 | Grad Max: 1.603620 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006161 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007425 | Grad Max: 0.007425 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002307 | Grad Max: 0.297535 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041942 | Grad Max: 1.610927 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.013593 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021396 | 
Grad Max: 0.142835 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000428 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004153 | Grad Max: 0.009685 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001053 | Grad Max: 0.002793 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000778 | Grad Max: 0.001969 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019200 | Grad Max: 0.019200 [GRADIENT NORM TOTAL] 7.4537 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.826 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74913806 0.2508619 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.070 [MASKS] A(Pass/Fail): 734/1314 | B: 620/1428 | C: 490/1558 [LOSS Ex1] A: 0.64040 | B: 0.62779 | C: 0.62480 [LOGITS Ex2 A] Mean Abs: 2.056 | Max: 6.427 [LOSS Ex2] A: 0.13189 | B: 0.32827 | C: 0.22165 ** [JOINT LOSS] ** : 0.858267 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004995 | Grad Max: 0.177715 -> Layer: shared_layers.0.bias | Grad Mean: 0.141133 | Grad Max: 0.620464 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006497 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003084 | Grad Max: 0.003084 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001021 | Grad Max: 0.233575 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017479 | Grad Max: 1.293205 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000101 | Grad Max: 0.004072 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006277 | Grad Max: 0.032463 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000271 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001482 | Grad Max: 0.003838 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000389 | Grad Max: 0.001288 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 
0.001635 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007227 | Grad Max: 0.007227 [GRADIENT NORM TOTAL] 3.5032 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.916 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50088435 0.49911568] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 714/1334 | B: 585/1271 | C: 337/1039 [LOSS Ex1] A: 0.64695 | B: 0.63208 | C: 0.62610 [LOGITS Ex2 A] Mean Abs: 2.110 | Max: 5.622 [LOSS Ex2] A: 0.11535 | B: 0.33046 | C: 0.24643 ** [JOINT LOSS] ** : 0.865785 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.064773 -> Layer: shared_layers.0.bias | Grad Mean: 0.082621 | Grad Max: 0.362353 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.005954 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002376 | Grad Max: 0.002376 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000757 | Grad Max: 0.177102 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012734 | Grad Max: 0.985454 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.004027 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002769 | Grad Max: 0.039263 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000129 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000425 | Grad Max: 0.002458 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000633 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001002 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001031 | Grad Max: 0.001031 [GRADIENT NORM TOTAL] 2.6441 [EPOCH SUMMARY] Train Loss: 0.8768 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8560 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 123/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.636 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6959383 0.3040617] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.070 [MASKS] A(Pass/Fail): 699/1349 | B: 630/1418 | C: 465/1583 [LOSS Ex1] A: 0.64286 | B: 0.63124 | C: 0.63426 [LOGITS Ex2 A] Mean Abs: 2.066 | Max: 5.971 [LOSS Ex2] A: 0.13945 | B: 0.34966 | C: 0.25886 ** [JOINT LOSS] ** : 0.885447 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005402 | Grad Max: 0.181821 -> Layer: shared_layers.0.bias | Grad Mean: 0.133374 | Grad Max: 0.546803 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.005768 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003018 | Grad Max: 0.003018 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001040 | Grad Max: 0.276625 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017308 | Grad Max: 1.557226 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.003898 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004622 | Grad Max: 0.028213 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000234 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001142 | Grad Max: 0.003711 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000280 | Grad Max: 0.000875 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000299 | Grad Max: 0.001030 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004670 | Grad Max: 0.004670 [GRADIENT NORM TOTAL] 3.5881 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.738 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.61995983 0.38004014] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.070 [MASKS] A(Pass/Fail): 587/1029 | B: 623/1425 | C: 494/1554 [LOSS Ex1] A: 0.64114 | B: 0.63194 | C: 0.62995 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 7.198 [LOSS Ex2] A: 0.13473 | B: 0.35121 | C: 0.27584 ** [JOINT LOSS] ** : 0.888274 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004082 | Grad Max: 0.156711 -> Layer: shared_layers.0.bias | Grad Mean: 0.141589 | Grad Max: 0.676188 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.005873 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007306 | Grad Max: 0.007306 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.175157 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019213 | Grad Max: 0.921149 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.004276 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003465 | Grad Max: 0.040708 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000135 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000489 | Grad Max: 0.002604 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000060 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.000660 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000175 | Grad Max: 0.000758 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000570 | Grad Max: 0.000570 [GRADIENT NORM TOTAL] 3.4806 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.917 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50732154 0.49267846] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 710/1338 | B: 620/1428 | C: 511/1537 [LOSS Ex1] A: 0.64172 | B: 0.62762 | C: 0.62433 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 7.395 [LOSS Ex2] A: 0.12090 | B: 0.33169 | C: 0.25955 ** [JOINT LOSS] ** : 0.868601 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.069470 
-> Layer: shared_layers.0.bias | Grad Mean: 0.088703 | Grad Max: 0.449160 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.005729 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003009 | Grad Max: 0.003009 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000736 | Grad Max: 0.175656 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012853 | Grad Max: 0.980993 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003241 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002506 | Grad Max: 0.020880 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000129 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000387 | Grad Max: 0.002705 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000054 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000099 | Grad Max: 0.000610 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000334 | Grad Max: 0.000917 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000648 | Grad Max: 0.000648 [GRADIENT NORM TOTAL] 2.6399 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.864 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104711 0.48952892] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 701/1347 | B: 585/1271 | C: 487/1561 [LOSS Ex1] A: 0.63870 | B: 0.63190 | C: 0.62747 [LOGITS Ex2 A] Mean Abs: 2.082 | Max: 5.476 [LOSS Ex2] A: 0.12882 | B: 0.33535 | C: 0.23720 ** [JOINT LOSS] ** : 0.866478 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003427 | Grad Max: 0.107804 -> Layer: shared_layers.0.bias | Grad Mean: 0.291572 | Grad Max: 1.520487 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006023 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000911 | Grad Max: 0.000911 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001770 | Grad Max: 0.447997 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031559 | Grad Max: 2.488886 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000195 | Grad Max: 0.008757 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013626 | Grad Max: 0.097625 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000308 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002666 | Grad Max: 0.006349 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000677 | Grad Max: 0.001764 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000480 | Grad Max: 0.001560 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011816 | Grad Max: 0.011816 [GRADIENT NORM TOTAL] 6.4368 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.892 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045406 0.49545938] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 707/1341 | B: 630/1418 | C: 530/1518 [LOSS Ex1] A: 0.63769 | B: 0.63107 | C: 0.62439 [LOGITS Ex2 A] Mean Abs: 2.064 | Max: 7.341 [LOSS Ex2] A: 0.14402 | B: 0.35644 | C: 0.24531 ** [JOINT LOSS] ** : 0.879644 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005917 | Grad Max: 0.265172 -> Layer: shared_layers.0.bias | Grad Mean: 0.104220 | Grad Max: 0.385255 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.006668 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001808 | Grad Max: 0.001808 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001016 | Grad Max: 0.165561 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015378 | Grad Max: 0.913546 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000083 | Grad Max: 0.004294 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004067 | Grad Max: 0.027065 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000214 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001040 | Grad Max: 0.003549 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000100 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000265 | Grad Max: 0.000944 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000253 | Grad Max: 0.001050 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004307 | Grad Max: 0.004307 [GRADIENT NORM TOTAL] 2.8222 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.738 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5021162 0.4978838] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 677/1371 | B: 623/1425 | C: 527/1521 [LOSS Ex1] A: 0.64595 | B: 0.63176 | C: 0.62183 [LOGITS Ex2 A] Mean Abs: 2.074 | Max: 5.849 [LOSS Ex2] A: 0.12167 | B: 0.35133 | C: 0.22469 ** [JOINT LOSS] ** : 0.865746 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002758 | Grad Max: 0.078278 -> Layer: shared_layers.0.bias | Grad Mean: 0.125310 | Grad Max: 0.518688 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002142 | Grad Max: 0.005922 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008006 | Grad Max: 0.008006 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000904 | Grad Max: 0.303575 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015348 | Grad Max: 1.713656 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.006260 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004381 | Grad Max: 0.048521 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000216 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.002718 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000174 | Grad Max: 0.000983 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001116 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002344 | Grad Max: 0.002344 [GRADIENT NORM TOTAL] 3.4490 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.117 | Max: 0.661 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54355335 0.45644668] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.067 [MASKS] A(Pass/Fail): 677/1371 | B: 620/1428 | C: 526/1522 [LOSS Ex1] A: 0.64604 | B: 0.62743 | C: 0.62529 [LOGITS Ex2 A] Mean Abs: 2.064 | Max: 6.010 [LOSS Ex2] A: 0.13017 | B: 0.33586 | C: 0.24973 ** [JOINT LOSS] ** : 0.871506 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003578 | Grad Max: 0.145065 -> Layer: shared_layers.0.bias | Grad Mean: 0.303104 | Grad Max: 1.743588 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005428 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003586 | Grad Max: 0.003586 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001956 | Grad Max: 0.319754 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035396 | Grad Max: 1.742353 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000229 | Grad Max: 0.008976 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016008 | Grad Max: 0.106424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000274 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003176 | Grad Max: 0.007117 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000809 | Grad Max: 0.001929 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000550 | Grad Max: 0.001705 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013962 | Grad Max: 0.013962 [GRADIENT NORM TOTAL] 6.6944 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.831 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7505793 0.24942073] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.071 [MASKS] A(Pass/Fail): 734/1314 | B: 586/1270 | C: 521/1527 [LOSS Ex1] A: 0.64014 | B: 0.63172 | C: 0.62214 [LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.538 [LOSS Ex2] A: 0.11337 | B: 0.33081 | C: 0.24392 ** [JOINT LOSS] ** : 0.860701 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003211 | Grad Max: 0.104264 -> Layer: shared_layers.0.bias | Grad Mean: 0.090113 | Grad Max: 0.403602 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002357 | Grad Max: 0.006763 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010784 | Grad Max: 0.010784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000759 | Grad Max: 0.184355 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012700 | Grad Max: 1.010200 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.002847 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002693 | Grad Max: 0.022632 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000161 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000569 | Grad Max: 0.002811 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000072 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000134 | Grad Max: 0.000853 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000381 | Grad Max: 0.001100 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001337 | Grad Max: 0.001337 [GRADIENT NORM TOTAL] 2.5630 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.921 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50091475 0.49908522] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.069 [MASKS] A(Pass/Fail): 715/1333 | B: 631/1417 | C: 500/1548 [LOSS Ex1] A: 0.64670 | B: 0.63088 | C: 0.62679 [LOGITS Ex2 A] Mean Abs: 2.110 | Max: 6.188 [LOSS Ex2] A: 0.11217 | B: 0.35604 | C: 0.26026 ** [JOINT LOSS] ** : 0.877617 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005743 | Grad Max: 0.180796 -> Layer: shared_layers.0.bias | Grad Mean: 0.245344 | Grad Max: 1.093278 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.006043 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003184 | Grad Max: 0.003184 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001605 | Grad Max: 
0.441398 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028263 | Grad Max: 2.489155 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.007240 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012888 | Grad Max: 0.072674 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000337 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002812 | Grad Max: 0.006393 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000146 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000722 | Grad Max: 0.001663 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000588 | Grad Max: 0.001490 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013246 | Grad Max: 0.013246 [GRADIENT NORM TOTAL] 5.5423 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.641 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.696925 0.303075] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.070 [MASKS] A(Pass/Fail): 702/1346 | B: 624/1424 | C: 493/1555 [LOSS Ex1] A: 0.64261 | B: 0.63157 | C: 0.62542 [LOGITS Ex2 A] Mean Abs: 2.111 | Max: 5.845 [LOSS Ex2] A: 0.13268 | B: 0.34457 | C: 0.25610 ** [JOINT LOSS] ** : 0.877644 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003445 | Grad Max: 0.092421 -> Layer: shared_layers.0.bias | Grad Mean: 0.176393 | Grad Max: 0.861026 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.006297 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010013 | Grad Max: 0.010013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001328 | Grad Max: 0.309706 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023457 | Grad Max: 1.728211 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000114 | Grad Max: 0.007119 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007781 | Grad Max: 0.073615 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000192 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001507 | Grad Max: 0.004217 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000093 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000379 | Grad Max: 0.001108 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000290 | Grad Max: 0.000998 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006003 | Grad Max: 0.006003 [GRADIENT NORM TOTAL] 4.7047 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.744 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62046486 0.3795351 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 586/1030 | B: 620/1428 | C: 531/1517 [LOSS Ex1] A: 0.64089 | B: 0.62723 | C: 0.62368 [LOGITS Ex2 A] Mean Abs: 2.155 | Max: 7.466 [LOSS Ex2] A: 0.12348 | B: 0.33666 | C: 0.25793 ** [JOINT LOSS] ** : 0.869956 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002449 | Grad Max: 0.072513 -> Layer: shared_layers.0.bias | Grad Mean: 0.171752 | Grad Max: 0.952079 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002240 | Grad Max: 0.006442 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008567 | Grad Max: 0.008567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001339 | Grad Max: 0.276032 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024022 | Grad Max: 1.543805 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.005688 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010409 | Grad Max: 0.056943 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000216 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002077 | Grad Max: 0.005304 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000116 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000528 | Grad Max: 0.001423 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000367 | Grad Max: 0.001248 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008775 | Grad Max: 0.008775 [GRADIENT NORM TOTAL] 4.4295 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.923 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072711 0.49272895] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 710/1338 | B: 586/1270 | C: 499/1549 [LOSS Ex1] A: 0.64146 | B: 0.63152 | C: 0.62438 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 6.614 [LOSS Ex2] A: 0.12049 | B: 0.34083 | C: 0.23087 ** [JOINT LOSS] ** : 0.863184 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003243 | Grad Max: 0.109716 -> Layer: shared_layers.0.bias | Grad Mean: 0.255179 | Grad Max: 1.173665 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.006087 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002790 | Grad Max: 0.002790 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001601 | Grad Max: 0.188105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029233 | Grad Max: 1.050740 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.010227 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016470 | Grad Max: 0.099716 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000332 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003263 | Grad Max: 0.007748 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000808 | Grad Max: 0.002340 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000598 | Grad Max: 0.001701 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013893 | Grad Max: 0.013893 [GRADIENT NORM TOTAL] 5.1480 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.870 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51043415 0.48956582] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.071 [MASKS] A(Pass/Fail): 704/1344 | B: 632/1416 | C: 501/1547 [LOSS Ex1] A: 0.63843 | B: 0.63068 | C: 0.62776 [LOGITS Ex2 A] Mean Abs: 2.146 | Max: 
6.100 [LOSS Ex2] A: 0.12674 | B: 0.34839 | C: 0.24973 ** [JOINT LOSS] ** : 0.873912 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003706 | Grad Max: 0.110021 -> Layer: shared_layers.0.bias | Grad Mean: 0.110637 | Grad Max: 0.509463 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006322 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003786 | Grad Max: 0.003786 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000951 | Grad Max: 0.126364 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015888 | Grad Max: 0.700898 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003651 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003382 | Grad Max: 0.035783 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000128 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000480 | Grad Max: 0.002120 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000055 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000137 | Grad Max: 0.000644 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000224 | Grad Max: 0.000754 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002901 | Grad Max: 0.002901 [GRADIENT NORM TOTAL] 2.7621 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.898 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045722 0.49542782] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 708/1340 | B: 624/1424 | C: 358/1018 [LOSS Ex1] A: 0.63743 | B: 0.63137 | C: 0.62064 [LOGITS Ex2 A] Mean Abs: 2.119 | Max: 7.051 [LOSS Ex2] A: 0.15301 | B: 0.35151 | C: 0.25354 ** [JOINT LOSS] ** : 0.882497 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007757 | Grad Max: 0.294427 -> Layer: shared_layers.0.bias | Grad Mean: 0.372485 | Grad Max: 1.545357 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002594 | 
Grad Max: 0.002594 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.423823 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046712 | Grad Max: 2.370435 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.010069 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021422 | Grad Max: 0.101515 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000500 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004616 | Grad Max: 0.010002 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000200 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001157 | Grad Max: 0.002696 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000863 | Grad Max: 0.002045 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019407 | Grad Max: 0.019407 [GRADIENT NORM TOTAL] 8.2167 [EPOCH SUMMARY] Train Loss: 0.8737 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8533 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8557 -> New: 0.8533) ############################## EPOCH 124/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.743 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5020353 0.49796462] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 677/1371 | B: 620/1428 | C: 521/1527 [LOSS Ex1] A: 0.64571 | B: 0.62703 | C: 0.62260 [LOGITS Ex2 A] Mean Abs: 2.109 | Max: 5.761 [LOSS Ex2] A: 0.11896 | B: 0.33304 | C: 0.24897 ** [JOINT LOSS] ** : 0.865433 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003213 | Grad Max: 0.095384 -> Layer: shared_layers.0.bias | Grad Mean: 0.197964 | Grad Max: 1.129166 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.006498 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009930 | Grad Max: 0.009930 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001268 | Grad Max: 0.269037 
-> Layer: exit2_layers.0.bias | Grad Mean: 0.023171 | Grad Max: 1.505456 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000131 | Grad Max: 0.007032 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009145 | Grad Max: 0.060640 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000238 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001919 | Grad Max: 0.004792 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000478 | Grad Max: 0.001268 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000369 | Grad Max: 0.001255 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007796 | Grad Max: 0.007796 [GRADIENT NORM TOTAL] 4.6896 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.666 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5436094 0.4563906] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.067 [MASKS] A(Pass/Fail): 677/1371 | B: 586/1270 | C: 503/1545 [LOSS Ex1] A: 0.64581 | B: 0.63132 | C: 0.62480 [LOGITS Ex2 A] Mean Abs: 2.048 | Max: 6.740 [LOSS Ex2] A: 0.13380 | B: 0.33615 | C: 0.22927 ** [JOINT LOSS] ** : 0.867049 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005318 | Grad Max: 0.143759 -> Layer: shared_layers.0.bias | Grad Mean: 0.328283 | Grad Max: 1.495407 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005987 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003032 | Grad Max: 0.003032 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.413912 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040608 | Grad Max: 2.305137 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.009208 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020264 | Grad Max: 0.097102 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004194 | Grad Max: 0.008922 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001053 | Grad Max: 0.002596 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000793 | Grad Max: 0.002162 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018195 | Grad Max: 0.018195 [GRADIENT NORM TOTAL] 7.2169 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.835 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75210744 0.24789259] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.071 [MASKS] A(Pass/Fail): 739/1309 | B: 632/1416 | C: 525/1523 [LOSS Ex1] A: 0.63989 | B: 0.63049 | C: 0.62023 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.441 [LOSS Ex2] A: 0.11883 | B: 0.35881 | C: 0.23410 ** [JOINT LOSS] ** : 0.867450 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003261 | Grad Max: 0.103649 -> Layer: shared_layers.0.bias | Grad Mean: 0.260358 | Grad Max: 1.560939 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.006524 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002123 | Grad Max: 0.002123 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001783 | Grad Max: 0.417564 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032486 | Grad Max: 2.346198 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.007225 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013451 | Grad Max: 0.069990 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000274 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002803 | Grad Max: 0.006733 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000149 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000698 | Grad Max: 0.001979 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000529 | Grad Max: 0.001484 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011993 | Grad Max: 0.011993 [GRADIENT NORM TOTAL] 6.4461 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.926 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500956 0.49904406] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 715/1333 | B: 624/1424 | C: 503/1545 [LOSS Ex1] A: 0.64647 | B: 0.63118 | C: 0.63045 [LOGITS Ex2 A] Mean Abs: 2.124 | Max: 5.785 [LOSS Ex2] A: 0.11241 | B: 0.34847 | C: 0.25057 ** [JOINT LOSS] ** : 0.873179 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003798 | Grad Max: 0.122908 -> Layer: shared_layers.0.bias | Grad Mean: 0.213063 | Grad Max: 0.898049 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005835 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000682 | Grad Max: 0.000682 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001671 | Grad Max: 0.198822 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029405 | Grad Max: 1.089567 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000186 | Grad Max: 0.011130 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012720 | Grad Max: 0.108514 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000234 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002343 | Grad Max: 0.005577 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000128 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000589 | Grad Max: 0.001702 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001301 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009635 | Grad Max: 0.009635 [GRADIENT NORM TOTAL] 4.9596 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.646 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6978943 0.3021057] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.071 [MASKS] A(Pass/Fail): 705/1343 | B: 620/1428 | C: 568/1480 [LOSS Ex1] A: 0.64236 | B: 0.62684 | C: 0.62144 [LOGITS Ex2 A] Mean Abs: 2.117 | Max: 6.897 
[LOSS Ex2] A: 0.13565 | B: 0.32806 | C: 0.25786 ** [JOINT LOSS] ** : 0.870735 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004327 | Grad Max: 0.149064 -> Layer: shared_layers.0.bias | Grad Mean: 0.239451 | Grad Max: 1.395006 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.005899 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003830 | Grad Max: 0.003830 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001667 | Grad Max: 0.285795 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029019 | Grad Max: 1.593446 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.009446 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011352 | Grad Max: 0.079326 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000232 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001998 | Grad Max: 0.005125 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000499 | Grad Max: 0.001289 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.001254 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008492 | Grad Max: 0.008492 [GRADIENT NORM TOTAL] 5.3317 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.749 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6209914 0.37900853] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 587/1029 | B: 586/1270 | C: 488/1560 [LOSS Ex1] A: 0.64064 | B: 0.63113 | C: 0.62888 [LOGITS Ex2 A] Mean Abs: 2.129 | Max: 8.132 [LOSS Ex2] A: 0.12289 | B: 0.33209 | C: 0.26186 ** [JOINT LOSS] ** : 0.872498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004748 | Grad Max: 0.140141 -> Layer: shared_layers.0.bias | Grad Mean: 0.278821 | Grad Max: 1.013433 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002145 | Grad Max: 0.005547 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004033 | Grad Max: 
0.004033 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001795 | Grad Max: 0.237487 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032947 | Grad Max: 1.254378 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.010154 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016402 | Grad Max: 0.113966 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000328 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003503 | Grad Max: 0.007233 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000916 | Grad Max: 0.002353 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000792 | Grad Max: 0.001767 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017640 | Grad Max: 0.017640 [GRADIENT NORM TOTAL] 5.6574 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.928 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50728065 0.49271935] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 712/1336 | B: 633/1415 | C: 501/1547 [LOSS Ex1] A: 0.64121 | B: 0.63031 | C: 0.62798 [LOGITS Ex2 A] Mean Abs: 2.140 | Max: 7.668 [LOSS Ex2] A: 0.10833 | B: 0.35221 | C: 0.24264 ** [JOINT LOSS] ** : 0.867563 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003413 | Grad Max: 0.114459 -> Layer: shared_layers.0.bias | Grad Mean: 0.218998 | Grad Max: 1.019021 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006198 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001432 | Grad Max: 0.001432 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001407 | Grad Max: 0.552259 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024828 | Grad Max: 3.082441 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.006743 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009451 | Grad Max: 0.061944 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000219 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.001783 | Grad Max: 0.004864 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000079 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000472 | Grad Max: 0.001096 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001320 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009370 | Grad Max: 0.009370 [GRADIENT NORM TOTAL] 5.7347 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.876 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104191 0.4895809] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 705/1343 | B: 625/1423 | C: 521/1527 [LOSS Ex1] A: 0.63817 | B: 0.63100 | C: 0.62559 [LOGITS Ex2 A] Mean Abs: 2.145 | Max: 5.701 [LOSS Ex2] A: 0.13931 | B: 0.34621 | C: 0.26974 ** [JOINT LOSS] ** : 0.883342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007947 | Grad Max: 0.210566 -> Layer: shared_layers.0.bias | Grad Mean: 0.335731 | Grad Max: 1.455301 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006512 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001296 | Grad Max: 0.001296 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002338 | Grad Max: 0.353433 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042201 | Grad Max: 1.973873 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000311 | Grad Max: 0.010175 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021137 | Grad Max: 0.112899 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000451 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004485 | Grad Max: 0.009621 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000210 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.002811 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000873 | Grad Max: 0.001768 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018891 | Grad Max: 0.018891 [GRADIENT NORM 
TOTAL] 7.2422 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.903 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046031 0.4953969] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 711/1337 | B: 622/1426 | C: 505/1543 [LOSS Ex1] A: 0.63717 | B: 0.62667 | C: 0.62526 [LOGITS Ex2 A] Mean Abs: 2.107 | Max: 8.059 [LOSS Ex2] A: 0.14490 | B: 0.32839 | C: 0.24335 ** [JOINT LOSS] ** : 0.868582 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005544 | Grad Max: 0.213998 -> Layer: shared_layers.0.bias | Grad Mean: 0.208179 | Grad Max: 0.752692 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002330 | Grad Max: 0.006387 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007138 | Grad Max: 0.007138 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001586 | Grad Max: 0.298433 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027542 | Grad Max: 1.636307 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006502 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012560 | Grad Max: 0.067392 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000340 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002795 | Grad Max: 0.007123 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000713 | Grad Max: 0.001794 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000540 | Grad Max: 0.001699 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012604 | Grad Max: 0.012604 [GRADIENT NORM TOTAL] 4.8401 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.748 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50198615 0.49801382] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.070 [MASKS] A(Pass/Fail): 677/1371 | B: 586/1270 | C: 535/1513 [LOSS Ex1] A: 
0.64548 | B: 0.63097 | C: 0.62326 [LOGITS Ex2 A] Mean Abs: 2.043 | Max: 6.303 [LOSS Ex2] A: 0.11916 | B: 0.34850 | C: 0.25533 ** [JOINT LOSS] ** : 0.874235 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003006 | Grad Max: 0.133608 -> Layer: shared_layers.0.bias | Grad Mean: 0.389931 | Grad Max: 1.798444 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005575 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003379 | Grad Max: 0.003379 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002571 | Grad Max: 0.446711 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047838 | Grad Max: 2.524558 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000348 | Grad Max: 0.015514 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024998 | Grad Max: 0.165887 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000456 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005128 | Grad Max: 0.010821 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001322 | Grad Max: 0.003263 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000993 | Grad Max: 0.002162 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023876 | Grad Max: 0.023876 [GRADIENT NORM TOTAL] 8.7055 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.118 | Max: 0.670 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54370445 0.45629558] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.068 [MASKS] A(Pass/Fail): 680/1368 | B: 633/1415 | C: 490/1558 [LOSS Ex1] A: 0.64558 | B: 0.63016 | C: 0.62763 [LOGITS Ex2 A] Mean Abs: 2.024 | Max: 6.150 [LOSS Ex2] A: 0.12314 | B: 0.36452 | C: 0.25796 ** [JOINT LOSS] ** : 0.882997 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006419 | Grad Max: 0.154096 -> Layer: shared_layers.0.bias | Grad Mean: 0.460939 | Grad Max: 2.082536 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | 
Grad Max: 0.006270 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010657 | Grad Max: 0.010657 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003004 | Grad Max: 0.538694 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055545 | Grad Max: 3.043663 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012139 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027844 | Grad Max: 0.153649 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000515 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005868 | Grad Max: 0.011519 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001520 | Grad Max: 0.003608 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001187 | Grad Max: 0.002372 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028199 | Grad Max: 0.028199 [GRADIENT NORM TOTAL] 10.3885 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.841 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75345695 0.24654305] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.071 [MASKS] A(Pass/Fail): 740/1308 | B: 625/1423 | C: 497/1551 [LOSS Ex1] A: 0.63965 | B: 0.63085 | C: 0.62491 [LOGITS Ex2 A] Mean Abs: 2.110 | Max: 6.069 [LOSS Ex2] A: 0.11737 | B: 0.34326 | C: 0.25489 ** [JOINT LOSS] ** : 0.870312 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.075573 -> Layer: shared_layers.0.bias | Grad Mean: 0.179245 | Grad Max: 0.718249 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006431 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004931 | Grad Max: 0.004931 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001319 | Grad Max: 0.334750 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024082 | Grad Max: 1.865856 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000133 | Grad Max: 0.006618 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009524 | Grad Max: 
0.052220 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000205 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001870 | Grad Max: 0.005010 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000126 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000462 | Grad Max: 0.001436 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001204 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007334 | Grad Max: 0.007334 [GRADIENT NORM TOTAL] 4.9534 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.932 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50091046 0.4990895 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.070 [MASKS] A(Pass/Fail): 715/1333 | B: 622/1426 | C: 497/1551 [LOSS Ex1] A: 0.64625 | B: 0.62651 | C: 0.62867 [LOGITS Ex2 A] Mean Abs: 2.120 | Max: 5.736 [LOSS Ex2] A: 0.11387 | B: 0.33017 | C: 0.25859 ** [JOINT LOSS] ** : 0.868021 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003536 | Grad Max: 0.159679 -> Layer: shared_layers.0.bias | Grad Mean: 0.270588 | Grad Max: 1.491322 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005975 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005942 | Grad Max: 0.005942 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001908 | Grad Max: 0.382657 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034068 | Grad Max: 2.122998 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.008332 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013831 | Grad Max: 0.086482 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000245 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002605 | Grad Max: 0.005962 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000683 | Grad Max: 0.001804 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000441 | Grad Max: 0.001738 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.011356 | Grad Max: 0.011356 [GRADIENT NORM TOTAL] 6.5544 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.650 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.69879067 0.30120933] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.071 [MASKS] A(Pass/Fail): 706/1342 | B: 586/1270 | C: 372/1004 [LOSS Ex1] A: 0.64212 | B: 0.63082 | C: 0.61676 [LOGITS Ex2 A] Mean Abs: 2.087 | Max: 6.312 [LOSS Ex2] A: 0.13423 | B: 0.32664 | C: 0.23054 ** [JOINT LOSS] ** : 0.860372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004170 | Grad Max: 0.131603 -> Layer: shared_layers.0.bias | Grad Mean: 0.143289 | Grad Max: 0.573071 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002357 | Grad Max: 0.006575 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007931 | Grad Max: 0.007931 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001093 | Grad Max: 0.167660 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018509 | Grad Max: 0.951221 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.005254 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007249 | Grad Max: 0.040660 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000243 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001575 | Grad Max: 0.005048 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000340 | Grad Max: 0.001205 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.001106 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003981 | Grad Max: 0.003981 [GRADIENT NORM TOTAL] 3.0544 [EPOCH SUMMARY] Train Loss: 0.8708 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8516 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8533 -> New: 0.8516) ############################## EPOCH 125/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.753 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62147266 0.37852737] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.071 [MASKS] A(Pass/Fail): 587/1029 | B: 633/1415 | C: 486/1562 [LOSS Ex1] A: 0.64039 | B: 0.63001 | C: 0.62838 [LOGITS Ex2 A] Mean Abs: 2.127 | Max: 7.139 [LOSS Ex2] A: 0.11752 | B: 0.35266 | C: 0.25929 ** [JOINT LOSS] ** : 0.876082 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002465 | Grad Max: 0.067582 -> Layer: shared_layers.0.bias | Grad Mean: 0.074624 | Grad Max: 0.351940 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006256 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002766 | Grad Max: 0.002766 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000784 | Grad Max: 0.207510 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013454 | Grad Max: 1.169100 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.003632 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003481 | Grad Max: 0.031046 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000159 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000714 | Grad Max: 0.003120 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000072 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000184 | Grad Max: 0.000862 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000260 | Grad Max: 0.000885 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004068 | Grad Max: 0.004068 [GRADIENT NORM TOTAL] 2.6899 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.934 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072824 0.4927176] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 712/1336 | B: 625/1423 | C: 519/1529 [LOSS Ex1] A: 0.64098 | B: 0.63069 | C: 0.62820 [LOGITS Ex2 A] Mean Abs: 2.128 | Max: 8.760 [LOSS Ex2] A: 0.11569 | B: 0.34948 | C: 0.25878 ** [JOINT LOSS] ** : 0.874604 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004232 | Grad Max: 0.112456 -> Layer: shared_layers.0.bias | Grad Mean: 0.343642 | Grad Max: 1.252653 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006144 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006066 | Grad Max: 0.006066 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002402 | Grad Max: 0.353519 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044014 | Grad Max: 1.971685 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.010474 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021725 | Grad Max: 0.114854 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000390 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004493 | Grad Max: 0.009610 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001125 | Grad Max: 0.002641 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000760 | Grad Max: 0.001944 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018504 | Grad Max: 0.018504 [GRADIENT NORM TOTAL] 7.6088 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.881 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5104458 0.48955417] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 706/1342 | B: 623/1425 | C: 530/1518 [LOSS Ex1] A: 0.63793 | B: 0.62634 | C: 0.61969 [LOGITS Ex2 A] Mean Abs: 2.109 | Max: 6.471 [LOSS Ex2] A: 0.12510 | B: 0.32997 | C: 0.24404 ** [JOINT LOSS] ** : 0.861020 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004264 | Grad Max: 0.130782 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.190611 | Grad Max: 0.874468 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002397 | Grad Max: 0.006037 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000827 | Grad Max: 0.000827 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.402113 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025690 | Grad Max: 2.249467 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.005464 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010283 | Grad Max: 0.054728 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000243 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002261 | Grad Max: 0.006238 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001345 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001383 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010061 | Grad Max: 0.010061 [GRADIENT NORM TOTAL] 5.0143 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.908 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5046133 0.49538678] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 712/1336 | B: 586/1270 | C: 520/1528 [LOSS Ex1] A: 0.63693 | B: 0.63064 | C: 0.62270 [LOGITS Ex2 A] Mean Abs: 2.051 | Max: 7.480 [LOSS Ex2] A: 0.14300 | B: 0.34102 | C: 0.25995 ** [JOINT LOSS] ** : 0.878078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005625 | Grad Max: 0.205481 -> Layer: shared_layers.0.bias | Grad Mean: 0.370863 | Grad Max: 1.484509 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002287 | Grad Max: 0.006186 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001567 | Grad Max: 0.001567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.267285 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041630 | Grad Max: 1.489181 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000309 
| Grad Max: 0.014898 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021253 | Grad Max: 0.153727 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000414 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004003 | Grad Max: 0.008845 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001045 | Grad Max: 0.002311 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000804 | Grad Max: 0.001786 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019825 | Grad Max: 0.019825 [GRADIENT NORM TOTAL] 7.3174 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.752 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50188524 0.49811473] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.070 [MASKS] A(Pass/Fail): 679/1369 | B: 633/1415 | C: 519/1529 [LOSS Ex1] A: 0.64527 | B: 0.62983 | C: 0.62402 [LOGITS Ex2 A] Mean Abs: 2.029 | Max: 6.684 [LOSS Ex2] A: 0.12482 | B: 0.35018 | C: 0.23992 ** [JOINT LOSS] ** : 0.871348 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004044 | Grad Max: 0.106032 -> Layer: shared_layers.0.bias | Grad Mean: 0.339179 | Grad Max: 1.338906 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005589 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.003366 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.252346 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041083 | Grad Max: 1.415466 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.012724 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020288 | Grad Max: 0.132517 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000366 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004010 | Grad Max: 0.008714 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001042 | Grad Max: 
0.002658 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000832 | Grad Max: 0.002106 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019279 | Grad Max: 0.019279 [GRADIENT NORM TOTAL] 6.9014 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.675 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54379785 0.45620212] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.553 | Std: 0.068 [MASKS] A(Pass/Fail): 680/1368 | B: 625/1423 | C: 510/1538 [LOSS Ex1] A: 0.64539 | B: 0.63051 | C: 0.62689 [LOGITS Ex2 A] Mean Abs: 2.056 | Max: 5.959 [LOSS Ex2] A: 0.12718 | B: 0.35015 | C: 0.25767 ** [JOINT LOSS] ** : 0.879266 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003961 | Grad Max: 0.115911 -> Layer: shared_layers.0.bias | Grad Mean: 0.260088 | Grad Max: 1.402621 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.006012 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008918 | Grad Max: 0.008918 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.368945 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031025 | Grad Max: 2.048323 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.007288 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012786 | Grad Max: 0.068241 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002772 | Grad Max: 0.006677 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000146 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000729 | Grad Max: 0.001869 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000596 | Grad Max: 0.001732 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013550 | Grad Max: 0.013550 [GRADIENT NORM TOTAL] 6.1638 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.845 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.7548706 0.24512938] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 741/1307 | B: 623/1425 | C: 514/1534 [LOSS Ex1] A: 0.63942 | B: 0.62617 | C: 0.62419 [LOGITS Ex2 A] Mean Abs: 2.100 | Max: 5.940 [LOSS Ex2] A: 0.11880 | B: 0.33496 | C: 0.23848 ** [JOINT LOSS] ** : 0.860676 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004689 | Grad Max: 0.172172 -> Layer: shared_layers.0.bias | Grad Mean: 0.406955 | Grad Max: 2.203925 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.006439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002127 | Grad Max: 0.002127 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002539 | Grad Max: 0.505533 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046822 | Grad Max: 2.823731 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000299 | Grad Max: 0.013042 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021387 | Grad Max: 0.125714 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000405 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004379 | Grad Max: 0.009522 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000212 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001110 | Grad Max: 0.002892 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000758 | Grad Max: 0.002232 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019150 | Grad Max: 0.019150 [GRADIENT NORM TOTAL] 9.2780 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.937 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50100857 0.49899143] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.071 [MASKS] A(Pass/Fail): 715/1333 | B: 586/1270 | C: 521/1527 [LOSS Ex1] A: 0.64605 | B: 0.63047 | C: 0.62361 [LOGITS Ex2 A] Mean Abs: 2.093 | Max: 6.294 [LOSS Ex2] A: 0.11150 | B: 0.32901 | C: 0.25909 ** [JOINT LOSS] ** : 0.866577 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.004715 | Grad Max: 0.140409 -> Layer: shared_layers.0.bias | Grad Mean: 0.140224 | Grad Max: 0.547403 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002116 | Grad Max: 0.006191 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009237 | Grad Max: 0.009237 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001160 | Grad Max: 0.203874 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020555 | Grad Max: 1.134210 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000125 | Grad Max: 0.005224 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008013 | Grad Max: 0.048595 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000232 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001771 | Grad Max: 0.004468 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000461 | Grad Max: 0.001276 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000473 | Grad Max: 0.001455 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009326 | Grad Max: 0.009326 [GRADIENT NORM TOTAL] 3.3930 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.655 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.69975126 0.3002488 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 706/1342 | B: 633/1415 | C: 501/1547 [LOSS Ex1] A: 0.64190 | B: 0.62967 | C: 0.62509 [LOGITS Ex2 A] Mean Abs: 2.104 | Max: 6.942 [LOSS Ex2] A: 0.14430 | B: 0.34875 | C: 0.24886 ** [JOINT LOSS] ** : 0.879524 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003016 | Grad Max: 0.081989 -> Layer: shared_layers.0.bias | Grad Mean: 0.130818 | Grad Max: 0.775195 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003924 | Grad Max: 0.003924 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000985 | Grad Max: 0.237168 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.016503 | Grad Max: 1.328306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004435 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005491 | Grad Max: 0.041808 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000159 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001016 | Grad Max: 0.003212 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.000933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000235 | Grad Max: 0.000871 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004548 | Grad Max: 0.004548 [GRADIENT NORM TOTAL] 3.5062 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.758 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62194985 0.37805015] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 587/1029 | B: 625/1423 | C: 516/1532 [LOSS Ex1] A: 0.64017 | B: 0.63036 | C: 0.62662 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.503 [LOSS Ex2] A: 0.12378 | B: 0.35147 | C: 0.23775 ** [JOINT LOSS] ** : 0.870047 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002781 | Grad Max: 0.074229 -> Layer: shared_layers.0.bias | Grad Mean: 0.189770 | Grad Max: 1.061653 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006432 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011784 | Grad Max: 0.011784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001120 | Grad Max: 0.432570 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020216 | Grad Max: 2.426720 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.005542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007378 | Grad Max: 0.057204 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000196 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001521 | Grad Max: 0.004465 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | 
Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000400 | Grad Max: 0.001177 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001309 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007686 | Grad Max: 0.007686 [GRADIENT NORM TOTAL] 5.0158 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.939 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5072333 0.4927667] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 712/1336 | B: 624/1424 | C: 511/1537 [LOSS Ex1] A: 0.64076 | B: 0.62601 | C: 0.62357 [LOGITS Ex2 A] Mean Abs: 2.121 | Max: 7.405 [LOSS Ex2] A: 0.11858 | B: 0.33344 | C: 0.24514 ** [JOINT LOSS] ** : 0.862500 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003053 | Grad Max: 0.078805 -> Layer: shared_layers.0.bias | Grad Mean: 0.149171 | Grad Max: 0.610070 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.006054 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002359 | Grad Max: 0.002359 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001143 | Grad Max: 0.326546 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020584 | Grad Max: 1.827532 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.007238 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008667 | Grad Max: 0.081118 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000218 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001857 | Grad Max: 0.004997 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000112 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000485 | Grad Max: 0.001367 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001388 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008177 | Grad Max: 0.008177 [GRADIENT NORM TOTAL] 4.2366 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.886 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51034987 0.48965013] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 706/1342 | B: 586/1270 | C: 505/1543 [LOSS Ex1] A: 0.63769 | B: 0.63032 | C: 0.62600 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 5.905 [LOSS Ex2] A: 0.12612 | B: 0.32918 | C: 0.23681 ** [JOINT LOSS] ** : 0.862041 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003265 | Grad Max: 0.100234 -> Layer: shared_layers.0.bias | Grad Mean: 0.264095 | Grad Max: 1.493723 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006798 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007290 | Grad Max: 0.007290 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001536 | Grad Max: 0.507294 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027702 | Grad Max: 2.814058 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000135 | Grad Max: 0.005512 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009564 | Grad Max: 0.051570 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000214 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001823 | Grad Max: 0.005169 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000103 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000436 | Grad Max: 0.001517 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001167 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006479 | Grad Max: 0.006479 [GRADIENT NORM TOTAL] 6.5018 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.914 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.504668 0.495332] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 713/1335 | B: 634/1414 | C: 559/1489 [LOSS Ex1] A: 0.63669 | B: 0.62951 | C: 0.62059 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 5.775 [LOSS Ex2] A: 0.13576 | B: 0.36304 | C: 0.23714 
** [JOINT LOSS] ** : 0.874245 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005839 | Grad Max: 0.196620 -> Layer: shared_layers.0.bias | Grad Mean: 0.208357 | Grad Max: 0.981326 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006554 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000173 | Grad Max: 0.000173 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001389 | Grad Max: 0.528966 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023039 | Grad Max: 2.941603 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.006180 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004364 | Grad Max: 0.057481 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000154 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000512 | Grad Max: 0.002981 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000742 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000211 | Grad Max: 0.000739 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003025 | Grad Max: 0.003025 [GRADIENT NORM TOTAL] 5.4922 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.756 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5018491 0.4981509] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.554 | Std: 0.070 [MASKS] A(Pass/Fail): 679/1369 | B: 625/1423 | C: 361/1015 [LOSS Ex1] A: 0.64505 | B: 0.63019 | C: 0.62346 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 5.945 [LOSS Ex2] A: 0.12461 | B: 0.34946 | C: 0.26225 ** [JOINT LOSS] ** : 0.878339 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003150 | Grad Max: 0.089742 -> Layer: shared_layers.0.bias | Grad Mean: 0.154896 | Grad Max: 0.672900 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006165 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008049 | Grad Max: 0.008049 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.001116 | Grad Max: 0.428503 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019872 | Grad Max: 2.389894 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.004951 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006258 | Grad Max: 0.050676 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001324 | Grad Max: 0.004167 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000080 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000328 | Grad Max: 0.001003 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.000976 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004505 | Grad Max: 0.004505 [GRADIENT NORM TOTAL] 4.5428 [EPOCH SUMMARY] Train Loss: 0.8710 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8506 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8516 -> New: 0.8506) ############################## EPOCH 126/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.119 | Max: 0.678 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.543824 0.45617598] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.068 [MASKS] A(Pass/Fail): 680/1368 | B: 625/1423 | C: 511/1537 [LOSS Ex1] A: 0.64518 | B: 0.62584 | C: 0.62627 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 5.628 [LOSS Ex2] A: 0.12662 | B: 0.32739 | C: 0.25217 ** [JOINT LOSS] ** : 0.867825 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001767 | Grad Max: 0.039948 -> Layer: shared_layers.0.bias | Grad Mean: 0.063048 | Grad Max: 0.290060 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006406 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011287 | Grad Max: 0.011287 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000570 | Grad Max: 0.160826 -> Layer: exit2_layers.0.bias | Grad Mean: 0.009771 | Grad 
Max: 0.902944 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.002897 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001862 | Grad Max: 0.023386 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000133 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000299 | Grad Max: 0.002115 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000055 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000075 | Grad Max: 0.000485 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001037 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000629 | Grad Max: 0.000629 [GRADIENT NORM TOTAL] 2.0422 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.850 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7560879 0.24391206] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.072 [MASKS] A(Pass/Fail): 741/1307 | B: 586/1270 | C: 498/1550 [LOSS Ex1] A: 0.63919 | B: 0.63015 | C: 0.62538 [LOGITS Ex2 A] Mean Abs: 2.101 | Max: 6.584 [LOSS Ex2] A: 0.11415 | B: 0.33238 | C: 0.25623 ** [JOINT LOSS] ** : 0.865828 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001987 | Grad Max: 0.070321 -> Layer: shared_layers.0.bias | Grad Mean: 0.099042 | Grad Max: 0.458998 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006236 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002979 | Grad Max: 0.002979 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000734 | Grad Max: 0.247826 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012747 | Grad Max: 1.391934 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002472 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001967 | Grad Max: 0.017625 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000326 | Grad Max: 0.001897 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000055 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000088 | Grad Max: 0.000517 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000327 | Grad Max: 0.000934 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000496 | Grad Max: 0.000496 [GRADIENT NORM TOTAL] 3.1797 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.943 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009708 0.49902928] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.071 [MASKS] A(Pass/Fail): 716/1332 | B: 638/1410 | C: 494/1554 [LOSS Ex1] A: 0.64582 | B: 0.62934 | C: 0.62709 [LOGITS Ex2 A] Mean Abs: 2.101 | Max: 6.098 [LOSS Ex2] A: 0.11670 | B: 0.34451 | C: 0.24238 ** [JOINT LOSS] ** : 0.868612 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003717 | Grad Max: 0.125754 -> Layer: shared_layers.0.bias | Grad Mean: 0.117779 | Grad Max: 0.432238 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005537 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003245 | Grad Max: 0.003245 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000999 | Grad Max: 0.227458 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016961 | Grad Max: 1.264416 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.003765 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006020 | Grad Max: 0.034133 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001428 | Grad Max: 0.004189 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000087 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000378 | Grad Max: 0.001017 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000408 | Grad Max: 0.001257 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006562 | Grad Max: 0.006562 [GRADIENT NORM TOTAL] 3.1551 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.126 | Max: 0.659 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7006436 0.29935646] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 706/1342 | B: 625/1423 | C: 507/1541 [LOSS Ex1] A: 0.64165 | B: 0.63001 | C: 0.62426 [LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.045 [LOSS Ex2] A: 0.13463 | B: 0.34435 | C: 0.22637 ** [JOINT LOSS] ** : 0.867087 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004127 | Grad Max: 0.143336 -> Layer: shared_layers.0.bias | Grad Mean: 0.332162 | Grad Max: 1.578519 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.005918 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002676 | Grad Max: 0.002676 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.539825 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040482 | Grad Max: 2.995574 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000268 | Grad Max: 0.010963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019370 | Grad Max: 0.116074 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000352 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003909 | Grad Max: 0.008385 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000179 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001018 | Grad Max: 0.002656 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000689 | Grad Max: 0.001885 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018551 | Grad Max: 0.018551 [GRADIENT NORM TOTAL] 7.9410 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.763 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6224491 0.37755093] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 587/1029 | B: 625/1423 | C: 513/1535 [LOSS Ex1] A: 0.63991 | B: 0.62565 | C: 0.62559 [LOGITS Ex2 A] Mean Abs: 2.138 | Max: 8.291 [LOSS Ex2] A: 0.11868 | B: 0.32845 | C: 0.24452 ** [JOINT LOSS] ** : 
0.860934 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003128 | Grad Max: 0.099117 -> Layer: shared_layers.0.bias | Grad Mean: 0.196023 | Grad Max: 0.797530 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.006121 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006218 | Grad Max: 0.006218 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001362 | Grad Max: 0.338701 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024306 | Grad Max: 1.911547 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.006491 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008646 | Grad Max: 0.059074 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000218 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001573 | Grad Max: 0.004536 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000082 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000373 | Grad Max: 0.001018 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.000993 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005241 | Grad Max: 0.005241 [GRADIENT NORM TOTAL] 5.0170 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.946 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50734025 0.4926598 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 712/1336 | B: 586/1270 | C: 506/1542 [LOSS Ex1] A: 0.64049 | B: 0.62996 | C: 0.62693 [LOGITS Ex2 A] Mean Abs: 2.092 | Max: 7.615 [LOSS Ex2] A: 0.12020 | B: 0.34366 | C: 0.27360 ** [JOINT LOSS] ** : 0.878279 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004007 | Grad Max: 0.126769 -> Layer: shared_layers.0.bias | Grad Mean: 0.378539 | Grad Max: 1.757590 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.006641 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010316 | Grad Max: 0.010316 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002261 | 
Grad Max: 0.453791 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041605 | Grad Max: 2.506737 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000279 | Grad Max: 0.008874 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020091 | Grad Max: 0.105333 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000333 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003951 | Grad Max: 0.008336 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001006 | Grad Max: 0.002521 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000757 | Grad Max: 0.001748 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018085 | Grad Max: 0.018085 [GRADIENT NORM TOTAL] 8.3907 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.892 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5102672 0.48973283] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 707/1341 | B: 638/1410 | C: 515/1533 [LOSS Ex1] A: 0.63741 | B: 0.62915 | C: 0.62523 [LOGITS Ex2 A] Mean Abs: 2.102 | Max: 5.885 [LOSS Ex2] A: 0.12789 | B: 0.36112 | C: 0.24484 ** [JOINT LOSS] ** : 0.875217 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003674 | Grad Max: 0.169449 -> Layer: shared_layers.0.bias | Grad Mean: 0.408629 | Grad Max: 2.224521 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006247 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000599 | Grad Max: 0.000599 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002483 | Grad Max: 0.514651 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045126 | Grad Max: 2.853621 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.011032 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020515 | Grad Max: 0.132069 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000353 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004029 | Grad Max: 
0.008438 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000152 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002419 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000710 | Grad Max: 0.001494 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018439 | Grad Max: 0.018439 [GRADIENT NORM TOTAL] 9.0799 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.920 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50482243 0.49517757] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 714/1334 | B: 625/1423 | C: 525/1523 [LOSS Ex1] A: 0.63640 | B: 0.62983 | C: 0.62149 [LOGITS Ex2 A] Mean Abs: 2.111 | Max: 7.163 [LOSS Ex2] A: 0.13509 | B: 0.34013 | C: 0.23434 ** [JOINT LOSS] ** : 0.865762 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005726 | Grad Max: 0.236953 -> Layer: shared_layers.0.bias | Grad Mean: 0.133935 | Grad Max: 0.552369 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.006423 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000273 | Grad Max: 0.000273 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001155 | Grad Max: 0.347833 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019301 | Grad Max: 1.844001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000115 | Grad Max: 0.004509 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007365 | Grad Max: 0.042698 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000306 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001754 | Grad Max: 0.004871 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000445 | Grad Max: 0.001274 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000360 | Grad Max: 0.001244 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006971 | Grad Max: 0.006971 [GRADIENT NORM TOTAL] 3.7945 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.762 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019133 0.4980867] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 681/1367 | B: 625/1423 | C: 537/1511 [LOSS Ex1] A: 0.64478 | B: 0.62547 | C: 0.62031 [LOGITS Ex2 A] Mean Abs: 2.099 | Max: 6.284 [LOSS Ex2] A: 0.11281 | B: 0.32663 | C: 0.24971 ** [JOINT LOSS] ** : 0.859901 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002584 | Grad Max: 0.112120 -> Layer: shared_layers.0.bias | Grad Mean: 0.244650 | Grad Max: 1.286583 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002268 | Grad Max: 0.006150 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007596 | Grad Max: 0.007597 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001512 | Grad Max: 0.290770 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027777 | Grad Max: 1.610811 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.007092 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012149 | Grad Max: 0.077968 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000249 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002427 | Grad Max: 0.005817 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000623 | Grad Max: 0.001731 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001604 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011170 | Grad Max: 0.011170 [GRADIENT NORM TOTAL] 5.5823 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.683 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.543867 0.45613295] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.068 [MASKS] A(Pass/Fail): 680/1368 | B: 586/1270 | C: 549/1499 [LOSS Ex1] A: 0.64492 | B: 0.62978 | C: 0.62006 [LOGITS Ex2 A] Mean Abs: 
2.047 | Max: 5.498 [LOSS Ex2] A: 0.12746 | B: 0.33337 | C: 0.22642 ** [JOINT LOSS] ** : 0.860671 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002830 | Grad Max: 0.062332 -> Layer: shared_layers.0.bias | Grad Mean: 0.176788 | Grad Max: 0.722882 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005970 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006644 | Grad Max: 0.006644 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001357 | Grad Max: 0.483373 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024835 | Grad Max: 2.691286 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.007550 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011511 | Grad Max: 0.074518 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002340 | Grad Max: 0.007282 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000578 | Grad Max: 0.001817 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000434 | Grad Max: 0.001580 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009610 | Grad Max: 0.009610 [GRADIENT NORM TOTAL] 4.8680 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.856 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75755334 0.24244665] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.072 [MASKS] A(Pass/Fail): 742/1306 | B: 638/1410 | C: 536/1512 [LOSS Ex1] A: 0.63892 | B: 0.62898 | C: 0.61964 [LOGITS Ex2 A] Mean Abs: 2.128 | Max: 6.148 [LOSS Ex2] A: 0.11286 | B: 0.34943 | C: 0.23242 ** [JOINT LOSS] ** : 0.860750 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001879 | Grad Max: 0.041778 -> Layer: shared_layers.0.bias | Grad Mean: 0.099805 | Grad Max: 0.490642 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006531 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.006439 | Grad Max: 0.006439 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000672 | Grad Max: 0.474920 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011660 | Grad Max: 2.636407 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.003265 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001857 | Grad Max: 0.015689 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000114 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000327 | Grad Max: 0.002040 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000051 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000083 | Grad Max: 0.000551 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.000831 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001081 | Grad Max: 0.001081 [GRADIENT NORM TOTAL] 3.8033 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.949 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008488 0.4991512] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 716/1332 | B: 626/1422 | C: 528/1520 [LOSS Ex1] A: 0.64557 | B: 0.62965 | C: 0.62048 [LOGITS Ex2 A] Mean Abs: 2.153 | Max: 5.766 [LOSS Ex2] A: 0.11593 | B: 0.34173 | C: 0.24269 ** [JOINT LOSS] ** : 0.865348 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003672 | Grad Max: 0.130656 -> Layer: shared_layers.0.bias | Grad Mean: 0.156219 | Grad Max: 0.743617 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005888 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003150 | Grad Max: 0.003150 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001116 | Grad Max: 0.253714 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018574 | Grad Max: 1.420228 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004111 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002727 | Grad Max: 0.036392 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | 
Grad Max: 0.000148 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.002296 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000050 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000085 | Grad Max: 0.000569 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.000831 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000098 | Grad Max: 0.000098 [GRADIENT NORM TOTAL] 3.9851 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.663 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70150274 0.2984972 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 706/1342 | B: 625/1423 | C: 518/1530 [LOSS Ex1] A: 0.64138 | B: 0.62529 | C: 0.62922 [LOGITS Ex2 A] Mean Abs: 2.115 | Max: 5.811 [LOSS Ex2] A: 0.13789 | B: 0.33119 | C: 0.26169 ** [JOINT LOSS] ** : 0.875558 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004388 | Grad Max: 0.119497 -> Layer: shared_layers.0.bias | Grad Mean: 0.197790 | Grad Max: 0.850599 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005761 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000193 | Grad Max: 0.000193 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001457 | Grad Max: 0.357206 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025164 | Grad Max: 2.009839 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.004882 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008137 | Grad Max: 0.054543 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000180 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001546 | Grad Max: 0.004595 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000382 | Grad Max: 0.001091 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000258 | Grad Max: 0.001014 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006310 | Grad Max: 
0.006310 [GRADIENT NORM TOTAL] 4.8915 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.767 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6228435 0.37715656] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 587/1029 | B: 586/1270 | C: 374/1002 [LOSS Ex1] A: 0.63964 | B: 0.62961 | C: 0.62295 [LOGITS Ex2 A] Mean Abs: 2.168 | Max: 8.143 [LOSS Ex2] A: 0.11785 | B: 0.32551 | C: 0.27770 ** [JOINT LOSS] ** : 0.871083 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004318 | Grad Max: 0.144405 -> Layer: shared_layers.0.bias | Grad Mean: 0.269003 | Grad Max: 0.970389 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006082 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003833 | Grad Max: 0.003833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001710 | Grad Max: 0.341307 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030889 | Grad Max: 1.908977 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.007806 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014208 | Grad Max: 0.070728 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000312 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003032 | Grad Max: 0.006898 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000786 | Grad Max: 0.001989 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000673 | Grad Max: 0.001487 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014732 | Grad Max: 0.014732 [GRADIENT NORM TOTAL] 5.7052 [EPOCH SUMMARY] Train Loss: 0.8673 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8485 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8506 -> New: 0.8485) ############################## EPOCH 127/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.951 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50741833 0.4925817 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 712/1336 | B: 638/1410 | C: 532/1516 [LOSS Ex1] A: 0.64024 | B: 0.62881 | C: 0.62270 [LOGITS Ex2 A] Mean Abs: 2.146 | Max: 8.052 [LOSS Ex2] A: 0.11707 | B: 0.35575 | C: 0.24977 ** [JOINT LOSS] ** : 0.871444 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003007 | Grad Max: 0.134821 -> Layer: shared_layers.0.bias | Grad Mean: 0.104221 | Grad Max: 0.448456 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006444 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001940 | Grad Max: 0.001940 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000967 | Grad Max: 0.308257 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016111 | Grad Max: 1.702443 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.004641 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003447 | Grad Max: 0.040813 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000119 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000470 | Grad Max: 0.002704 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000055 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000126 | Grad Max: 0.000622 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000221 | Grad Max: 0.000698 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002706 | Grad Max: 0.002706 [GRADIENT NORM TOTAL] 3.3867 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.898 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101735 0.4898265] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 707/1341 | B: 627/1421 | C: 500/1548 [LOSS Ex1] A: 0.63715 | B: 0.62947 | C: 0.62535 [LOGITS Ex2 A] Mean Abs: 2.163 | Max: 5.265 [LOSS Ex2] A: 0.13043 | B: 0.34103 | C: 0.25449 ** [JOINT LOSS] ** : 0.872642 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006828 | Grad Max: 0.212290 -> Layer: shared_layers.0.bias | Grad Mean: 0.383471 | Grad Max: 1.442111 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006231 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001975 | Grad Max: 0.001975 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002712 | Grad Max: 0.470730 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049490 | Grad Max: 2.624038 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000337 | Grad Max: 0.011384 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023829 | Grad Max: 0.124657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000520 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005009 | Grad Max: 0.011195 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000231 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001261 | Grad Max: 0.003089 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000929 | Grad Max: 0.002192 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021668 | Grad Max: 0.021668 [GRADIENT NORM TOTAL] 8.3823 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.926 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5049619 0.49503812] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 714/1334 | B: 625/1423 | C: 504/1544 [LOSS Ex1] A: 0.63614 | B: 0.62512 | C: 0.62413 [LOGITS Ex2 A] Mean Abs: 2.130 | Max: 7.850 [LOSS Ex2] A: 0.13873 | B: 0.33458 | C: 0.22906 ** [JOINT LOSS] ** : 0.862587 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006308 | Grad Max: 0.253398 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.182261 | Grad Max: 0.638018 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002302 | Grad Max: 0.006521 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000302 | Grad Max: 0.000302 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001471 | Grad Max: 0.346287 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025166 | Grad Max: 1.904089 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000153 | Grad Max: 0.006095 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010327 | Grad Max: 0.058345 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000319 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002413 | Grad Max: 0.005867 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000622 | Grad Max: 0.001785 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000468 | Grad Max: 0.001695 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010640 | Grad Max: 0.010640 [GRADIENT NORM TOTAL] 4.5892 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.766 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5019249 0.49807513] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 681/1367 | B: 587/1269 | C: 559/1489 [LOSS Ex1] A: 0.64453 | B: 0.62943 | C: 0.62076 [LOGITS Ex2 A] Mean Abs: 2.059 | Max: 6.195 [LOSS Ex2] A: 0.12073 | B: 0.34398 | C: 0.26219 ** [JOINT LOSS] ** : 0.873877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004424 | Grad Max: 0.171149 -> Layer: shared_layers.0.bias | Grad Mean: 0.479756 | Grad Max: 2.250980 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005591 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003947 | Grad Max: 0.003947 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003193 | Grad Max: 0.577851 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059689 | Grad Max: 3.257793 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 
| Grad Max: 0.016781 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031412 | Grad Max: 0.179261 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000633 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006299 | Grad Max: 0.013369 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000284 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001603 | Grad Max: 0.003948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001187 | Grad Max: 0.002381 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028609 | Grad Max: 0.028609 [GRADIENT NORM TOTAL] 10.8537 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.687 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.543837 0.456163] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 680/1368 | B: 638/1410 | C: 530/1518 [LOSS Ex1] A: 0.64468 | B: 0.62863 | C: 0.62360 [LOGITS Ex2 A] Mean Abs: 2.051 | Max: 6.054 [LOSS Ex2] A: 0.13209 | B: 0.36166 | C: 0.24274 ** [JOINT LOSS] ** : 0.877799 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005931 | Grad Max: 0.176405 -> Layer: shared_layers.0.bias | Grad Mean: 0.479489 | Grad Max: 2.339409 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005761 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008432 | Grad Max: 0.008432 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003257 | Grad Max: 0.553576 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060267 | Grad Max: 3.134782 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.016395 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029383 | Grad Max: 0.197911 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000590 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006098 | Grad Max: 0.013431 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000256 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001545 | Grad Max: 
0.003652 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001153 | Grad Max: 0.002415 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027350 | Grad Max: 0.027350 [GRADIENT NORM TOTAL] 11.0472 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.861 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7587285 0.24127145] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 742/1306 | B: 627/1421 | C: 533/1515 [LOSS Ex1] A: 0.63868 | B: 0.62931 | C: 0.62350 [LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.637 [LOSS Ex2] A: 0.11854 | B: 0.33715 | C: 0.24908 ** [JOINT LOSS] ** : 0.865416 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002686 | Grad Max: 0.078366 -> Layer: shared_layers.0.bias | Grad Mean: 0.090883 | Grad Max: 0.358620 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006594 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003896 | Grad Max: 0.003896 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000858 | Grad Max: 0.185811 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014689 | Grad Max: 1.021797 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.004062 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002380 | Grad Max: 0.025830 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000121 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000336 | Grad Max: 0.002015 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000058 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000087 | Grad Max: 0.000512 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000345 | Grad Max: 0.000971 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000294 | Grad Max: 0.000294 [GRADIENT NORM TOTAL] 3.2112 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.954 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.50082356 0.49917647] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.071 [MASKS] A(Pass/Fail): 716/1332 | B: 627/1421 | C: 522/1526 [LOSS Ex1] A: 0.64534 | B: 0.62495 | C: 0.62304 [LOGITS Ex2 A] Mean Abs: 2.153 | Max: 5.854 [LOSS Ex2] A: 0.10553 | B: 0.32409 | C: 0.23171 ** [JOINT LOSS] ** : 0.851554 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004252 | Grad Max: 0.149725 -> Layer: shared_layers.0.bias | Grad Mean: 0.354108 | Grad Max: 1.812570 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.005535 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000613 | Grad Max: 0.000613 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002409 | Grad Max: 0.345425 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043830 | Grad Max: 1.928179 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.014199 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021477 | Grad Max: 0.143479 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000357 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004087 | Grad Max: 0.009375 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000175 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001046 | Grad Max: 0.002378 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000690 | Grad Max: 0.002099 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018498 | Grad Max: 0.018498 [GRADIENT NORM TOTAL] 8.0195 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.667 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70233244 0.29766756] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 708/1340 | B: 588/1268 | C: 514/1534 [LOSS Ex1] A: 0.64115 | B: 0.62927 | C: 0.62501 [LOGITS Ex2 A] Mean Abs: 2.132 | Max: 6.845 [LOSS Ex2] A: 0.13196 | B: 0.31977 | C: 0.23611 ** [JOINT LOSS] ** : 0.861090 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight 
| Grad Mean: 0.004226 | Grad Max: 0.140490 -> Layer: shared_layers.0.bias | Grad Mean: 0.139578 | Grad Max: 0.630345 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.006123 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003142 | Grad Max: 0.003142 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001214 | Grad Max: 0.401176 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020184 | Grad Max: 2.250614 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.005339 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005306 | Grad Max: 0.053912 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000187 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000855 | Grad Max: 0.003276 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.000849 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.001091 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004282 | Grad Max: 0.004282 [GRADIENT NORM TOTAL] 4.2493 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.772 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62326705 0.37673295] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 588/1028 | B: 639/1409 | C: 518/1530 [LOSS Ex1] A: 0.63940 | B: 0.62848 | C: 0.62524 [LOGITS Ex2 A] Mean Abs: 2.136 | Max: 7.115 [LOSS Ex2] A: 0.12387 | B: 0.34806 | C: 0.24925 ** [JOINT LOSS] ** : 0.871435 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004938 | Grad Max: 0.135684 -> Layer: shared_layers.0.bias | Grad Mean: 0.284065 | Grad Max: 1.212398 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005938 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002998 | Grad Max: 0.002998 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001819 | Grad Max: 0.414730 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.032804 | Grad Max: 2.324109 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.006963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015858 | Grad Max: 0.080727 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000316 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003410 | Grad Max: 0.007253 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000882 | Grad Max: 0.002231 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000699 | Grad Max: 0.001648 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016234 | Grad Max: 0.016234 [GRADIENT NORM TOTAL] 6.1430 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.956 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074105 0.49258944] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 712/1336 | B: 627/1421 | C: 546/1502 [LOSS Ex1] A: 0.64001 | B: 0.62916 | C: 0.62043 [LOGITS Ex2 A] Mean Abs: 2.107 | Max: 7.173 [LOSS Ex2] A: 0.11787 | B: 0.34057 | C: 0.25558 ** [JOINT LOSS] ** : 0.867875 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001859 | Grad Max: 0.040924 -> Layer: shared_layers.0.bias | Grad Mean: 0.075306 | Grad Max: 0.362504 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.005976 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000646 | Grad Max: 0.000646 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000744 | Grad Max: 0.320179 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012980 | Grad Max: 1.793447 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003210 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002999 | Grad Max: 0.034369 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000129 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000534 | Grad Max: 0.002855 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | 
Grad Max: 0.000066 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000135 | Grad Max: 0.000595 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000308 | Grad Max: 0.001062 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003730 | Grad Max: 0.003730 [GRADIENT NORM TOTAL] 3.1660 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.902 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51012504 0.48987496] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 707/1341 | B: 627/1421 | C: 497/1551 [LOSS Ex1] A: 0.63692 | B: 0.62480 | C: 0.62594 [LOGITS Ex2 A] Mean Abs: 2.136 | Max: 5.546 [LOSS Ex2] A: 0.12075 | B: 0.32689 | C: 0.24621 ** [JOINT LOSS] ** : 0.860508 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004895 | Grad Max: 0.169071 -> Layer: shared_layers.0.bias | Grad Mean: 0.254663 | Grad Max: 1.450734 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.005931 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003508 | Grad Max: 0.003508 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001770 | Grad Max: 0.325539 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031450 | Grad Max: 1.821153 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.006624 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012598 | Grad Max: 0.073893 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000333 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002788 | Grad Max: 0.006574 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000708 | Grad Max: 0.001868 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000516 | Grad Max: 0.001633 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012378 | Grad Max: 0.012378 [GRADIENT NORM TOTAL] 6.0482 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.931 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505041 0.494959] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.072 [MASKS] A(Pass/Fail): 714/1334 | B: 589/1267 | C: 534/1514 [LOSS Ex1] A: 0.63592 | B: 0.62912 | C: 0.62170 [LOGITS Ex2 A] Mean Abs: 2.118 | Max: 6.590 [LOSS Ex2] A: 0.14358 | B: 0.32609 | C: 0.24678 ** [JOINT LOSS] ** : 0.867724 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004576 | Grad Max: 0.214866 -> Layer: shared_layers.0.bias | Grad Mean: 0.107637 | Grad Max: 0.400812 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006358 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002415 | Grad Max: 0.002415 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001071 | Grad Max: 0.298623 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017030 | Grad Max: 1.626941 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003390 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002802 | Grad Max: 0.026889 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000179 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000621 | Grad Max: 0.003019 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000147 | Grad Max: 0.000615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000254 | Grad Max: 0.000860 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001652 | Grad Max: 0.001652 [GRADIENT NORM TOTAL] 3.7537 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.770 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50189596 0.498104 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 681/1367 | B: 639/1409 | C: 523/1525 [LOSS Ex1] A: 0.64432 | B: 0.62833 | C: 0.62275 [LOGITS Ex2 A] Mean Abs: 2.065 | Max: 6.270 [LOSS Ex2] A: 0.11933 | B: 0.34711 | C: 0.23877 
** [JOINT LOSS] ** : 0.866870 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002773 | Grad Max: 0.091100 -> Layer: shared_layers.0.bias | Grad Mean: 0.215909 | Grad Max: 1.162116 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005864 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000155 | Grad Max: 0.000155 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001582 | Grad Max: 0.464444 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029032 | Grad Max: 2.601628 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.008291 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012995 | Grad Max: 0.082491 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000251 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002645 | Grad Max: 0.005798 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000145 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000694 | Grad Max: 0.001996 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000597 | Grad Max: 0.001885 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013750 | Grad Max: 0.013750 [GRADIENT NORM TOTAL] 5.8381 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.120 | Max: 0.691 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438614 0.45613867] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 681/1367 | B: 627/1421 | C: 339/1037 [LOSS Ex1] A: 0.64448 | B: 0.62899 | C: 0.62260 [LOGITS Ex2 A] Mean Abs: 2.067 | Max: 5.261 [LOSS Ex2] A: 0.11889 | B: 0.33962 | C: 0.22981 ** [JOINT LOSS] ** : 0.861467 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001855 | Grad Max: 0.033617 -> Layer: shared_layers.0.bias | Grad Mean: 0.086932 | Grad Max: 0.457515 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005828 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005936 | Grad Max: 0.005936 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.000689 | Grad Max: 0.358176 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012037 | Grad Max: 2.000298 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002572 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002043 | Grad Max: 0.017685 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000323 | Grad Max: 0.002430 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000082 | Grad Max: 0.000453 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.000882 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000449 | Grad Max: 0.000449 [GRADIENT NORM TOTAL] 3.3991 [EPOCH SUMMARY] Train Loss: 0.8666 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8482 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8485 -> New: 0.8482) ############################## EPOCH 128/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.865 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7599319 0.24006806] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 744/1304 | B: 627/1421 | C: 503/1545 [LOSS Ex1] A: 0.63845 | B: 0.62463 | C: 0.62765 [LOGITS Ex2 A] Mean Abs: 2.138 | Max: 5.791 [LOSS Ex2] A: 0.10998 | B: 0.32057 | C: 0.24656 ** [JOINT LOSS] ** : 0.855951 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002615 | Grad Max: 0.083204 -> Layer: shared_layers.0.bias | Grad Mean: 0.205121 | Grad Max: 1.080095 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006403 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006141 | Grad Max: 0.006141 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001474 | Grad Max: 0.275856 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027116 | Grad 
Max: 1.531781 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006757 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012599 | Grad Max: 0.080096 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000256 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002540 | Grad Max: 0.005972 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000634 | Grad Max: 0.001867 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000468 | Grad Max: 0.001679 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010563 | Grad Max: 0.010563 [GRADIENT NORM TOTAL] 5.0099 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.960 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008322 0.49916783] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 716/1332 | B: 592/1264 | C: 518/1530 [LOSS Ex1] A: 0.64513 | B: 0.62895 | C: 0.62395 [LOGITS Ex2 A] Mean Abs: 2.138 | Max: 6.123 [LOSS Ex2] A: 0.11095 | B: 0.32394 | C: 0.23623 ** [JOINT LOSS] ** : 0.856381 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003508 | Grad Max: 0.140954 -> Layer: shared_layers.0.bias | Grad Mean: 0.092059 | Grad Max: 0.459773 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005727 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001036 | Grad Max: 0.001036 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000854 | Grad Max: 0.281484 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013835 | Grad Max: 1.583864 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.004304 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003880 | Grad Max: 0.031221 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000970 | Grad Max: 0.003881 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000233 | Grad Max: 0.000943 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001150 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003841 | Grad Max: 0.003841 [GRADIENT NORM TOTAL] 2.9896 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.671 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70319766 0.29680237] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.073 [MASKS] A(Pass/Fail): 708/1340 | B: 640/1408 | C: 524/1524 [LOSS Ex1] A: 0.64091 | B: 0.62815 | C: 0.62246 [LOGITS Ex2 A] Mean Abs: 2.131 | Max: 6.805 [LOSS Ex2] A: 0.14161 | B: 0.35807 | C: 0.22443 ** [JOINT LOSS] ** : 0.871879 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004510 | Grad Max: 0.153747 -> Layer: shared_layers.0.bias | Grad Mean: 0.153671 | Grad Max: 0.593802 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006380 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000214 | Grad Max: 0.000214 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001250 | Grad Max: 0.145133 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021809 | Grad Max: 0.771722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000141 | Grad Max: 0.005844 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009747 | Grad Max: 0.054967 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000303 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002134 | Grad Max: 0.005895 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000505 | Grad Max: 0.001543 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001424 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008024 | Grad Max: 0.008024 [GRADIENT NORM TOTAL] 3.4729 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS 
Ex1 A] Mean Abs: 0.130 | Max: 0.777 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62374854 0.37625143] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 588/1028 | B: 628/1420 | C: 505/1543 [LOSS Ex1] A: 0.63916 | B: 0.62881 | C: 0.62502 [LOGITS Ex2 A] Mean Abs: 2.183 | Max: 7.842 [LOSS Ex2] A: 0.11775 | B: 0.34491 | C: 0.27732 ** [JOINT LOSS] ** : 0.877656 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004135 | Grad Max: 0.154977 -> Layer: shared_layers.0.bias | Grad Mean: 0.416544 | Grad Max: 2.050894 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006584 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010267 | Grad Max: 0.010267 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002671 | Grad Max: 0.398786 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049520 | Grad Max: 2.208066 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.011998 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024121 | Grad Max: 0.129058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000476 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004836 | Grad Max: 0.011438 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000219 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001184 | Grad Max: 0.003023 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000761 | Grad Max: 0.001822 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018718 | Grad Max: 0.018718 [GRADIENT NORM TOTAL] 9.1739 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.962 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50740767 0.49259233] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 714/1334 | B: 628/1420 | C: 527/1521 [LOSS Ex1] A: 0.63977 | B: 0.62444 | C: 0.61964 [LOGITS Ex2 A] Mean Abs: 2.171 | Max: 7.861 [LOSS Ex2] A: 0.11687 | B: 0.32640 | C: 0.25469 ** [JOINT LOSS] 
** : 0.860605 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005442 | Grad Max: 0.178751 -> Layer: shared_layers.0.bias | Grad Mean: 0.344632 | Grad Max: 1.360460 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002347 | Grad Max: 0.006267 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002449 | Grad Max: 0.002449 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002414 | Grad Max: 0.406335 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043773 | Grad Max: 2.281309 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.012798 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021041 | Grad Max: 0.128095 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000414 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004447 | Grad Max: 0.009092 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001140 | Grad Max: 0.002754 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000803 | Grad Max: 0.002157 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020009 | Grad Max: 0.020009 [GRADIENT NORM TOTAL] 7.7682 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.907 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101135 0.48988655] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 707/1341 | B: 592/1264 | C: 549/1499 [LOSS Ex1] A: 0.63667 | B: 0.62876 | C: 0.62073 [LOGITS Ex2 A] Mean Abs: 2.133 | Max: 6.342 [LOSS Ex2] A: 0.12590 | B: 0.33511 | C: 0.24792 ** [JOINT LOSS] ** : 0.865027 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003479 | Grad Max: 0.092943 -> Layer: shared_layers.0.bias | Grad Mean: 0.234595 | Grad Max: 0.930831 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.007138 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006234 | Grad Max: 0.006234 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001488 | 
Grad Max: 0.181911 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026194 | Grad Max: 1.008165 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000157 | Grad Max: 0.006365 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011258 | Grad Max: 0.064774 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002097 | Grad Max: 0.005365 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000522 | Grad Max: 0.001540 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001248 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008976 | Grad Max: 0.008976 [GRADIENT NORM TOTAL] 4.6507 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.936 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050783 0.49492168] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 714/1334 | B: 640/1408 | C: 527/1521 [LOSS Ex1] A: 0.63566 | B: 0.62797 | C: 0.62463 [LOGITS Ex2 A] Mean Abs: 2.121 | Max: 6.627 [LOSS Ex2] A: 0.13223 | B: 0.34635 | C: 0.24180 ** [JOINT LOSS] ** : 0.869544 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004543 | Grad Max: 0.176911 -> Layer: shared_layers.0.bias | Grad Mean: 0.248644 | Grad Max: 1.137499 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.006122 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002953 | Grad Max: 0.002953 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001701 | Grad Max: 0.253466 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029109 | Grad Max: 1.391820 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.007409 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011723 | Grad Max: 0.091462 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000201 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002065 | Grad Max: 
0.005173 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000548 | Grad Max: 0.001451 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001177 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010707 | Grad Max: 0.010707 [GRADIENT NORM TOTAL] 5.4486 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.774 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50182736 0.49817264] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.072 [MASKS] A(Pass/Fail): 681/1367 | B: 629/1419 | C: 534/1514 [LOSS Ex1] A: 0.64409 | B: 0.62864 | C: 0.62178 [LOGITS Ex2 A] Mean Abs: 2.122 | Max: 7.454 [LOSS Ex2] A: 0.11489 | B: 0.34998 | C: 0.22742 ** [JOINT LOSS] ** : 0.862265 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003986 | Grad Max: 0.122462 -> Layer: shared_layers.0.bias | Grad Mean: 0.151273 | Grad Max: 0.604920 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005786 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006095 | Grad Max: 0.006095 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001135 | Grad Max: 0.240657 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019734 | Grad Max: 1.360889 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.003864 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006449 | Grad Max: 0.035613 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001510 | Grad Max: 0.004377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000087 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000392 | Grad Max: 0.001138 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001240 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006395 | Grad Max: 0.006395 [GRADIENT NORM TOTAL] 3.8689 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.695 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439652 0.45603484] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.069 [MASKS] A(Pass/Fail): 682/1366 | B: 631/1417 | C: 501/1547 [LOSS Ex1] A: 0.64425 | B: 0.62427 | C: 0.62642 [LOGITS Ex2 A] Mean Abs: 2.108 | Max: 6.342 [LOSS Ex2] A: 0.13127 | B: 0.32761 | C: 0.22973 ** [JOINT LOSS] ** : 0.861184 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003660 | Grad Max: 0.126367 -> Layer: shared_layers.0.bias | Grad Mean: 0.154341 | Grad Max: 0.614713 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.005801 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008483 | Grad Max: 0.008483 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.363852 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019773 | Grad Max: 2.064565 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.004569 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004235 | Grad Max: 0.039699 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000158 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000685 | Grad Max: 0.002561 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000202 | Grad Max: 0.000737 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000355 | Grad Max: 0.001016 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004284 | Grad Max: 0.004284 [GRADIENT NORM TOTAL] 4.6306 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.870 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7613682 0.23863181] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 745/1303 | B: 592/1264 | C: 526/1522 [LOSS Ex1] A: 0.63820 | B: 0.62859 | C: 0.62202 [LOGITS Ex2 A] Mean Abs: 
2.137 | Max: 6.818 [LOSS Ex2] A: 0.11236 | B: 0.32666 | C: 0.25667 ** [JOINT LOSS] ** : 0.861501 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.041869 -> Layer: shared_layers.0.bias | Grad Mean: 0.102976 | Grad Max: 0.504307 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002330 | Grad Max: 0.006193 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010882 | Grad Max: 0.010882 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000807 | Grad Max: 0.329785 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014378 | Grad Max: 1.849806 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.002703 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002187 | Grad Max: 0.026585 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000131 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000362 | Grad Max: 0.002093 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000049 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000099 | Grad Max: 0.000457 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000301 | Grad Max: 0.000737 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000581 | Grad Max: 0.000581 [GRADIENT NORM TOTAL] 3.9964 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.965 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50083625 0.4991637 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 716/1332 | B: 640/1408 | C: 545/1503 [LOSS Ex1] A: 0.64490 | B: 0.62780 | C: 0.62135 [LOGITS Ex2 A] Mean Abs: 2.146 | Max: 5.835 [LOSS Ex2] A: 0.10832 | B: 0.34493 | C: 0.23973 ** [JOINT LOSS] ** : 0.862343 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005429 | Grad Max: 0.210309 -> Layer: shared_layers.0.bias | Grad Mean: 0.207324 | Grad Max: 0.913979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.005712 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.000847 | Grad Max: 0.000847 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.438434 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025096 | Grad Max: 2.476779 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.004634 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009247 | Grad Max: 0.041349 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000286 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002160 | Grad Max: 0.005931 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000541 | Grad Max: 0.001524 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000507 | Grad Max: 0.001745 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009904 | Grad Max: 0.009904 [GRADIENT NORM TOTAL] 5.0769 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.676 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70413005 0.29587 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.073 [MASKS] A(Pass/Fail): 709/1339 | B: 630/1418 | C: 550/1498 [LOSS Ex1] A: 0.64067 | B: 0.62846 | C: 0.61806 [LOGITS Ex2 A] Mean Abs: 2.155 | Max: 6.460 [LOSS Ex2] A: 0.12937 | B: 0.34655 | C: 0.23056 ** [JOINT LOSS] ** : 0.864550 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004432 | Grad Max: 0.152575 -> Layer: shared_layers.0.bias | Grad Mean: 0.209173 | Grad Max: 1.272265 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006150 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008407 | Grad Max: 0.008407 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001384 | Grad Max: 0.247984 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023776 | Grad Max: 1.325265 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.004987 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006416 | Grad Max: 0.048588 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | 
Grad Max: 0.000174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001005 | Grad Max: 0.003526 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000223 | Grad Max: 0.000975 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000293 | Grad Max: 0.000869 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002397 | Grad Max: 0.002397 [GRADIENT NORM TOTAL] 4.6348 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.782 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62425774 0.37574223] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.073 [MASKS] A(Pass/Fail): 588/1028 | B: 631/1417 | C: 517/1531 [LOSS Ex1] A: 0.63892 | B: 0.62409 | C: 0.62466 [LOGITS Ex2 A] Mean Abs: 2.207 | Max: 7.404 [LOSS Ex2] A: 0.12043 | B: 0.32705 | C: 0.23070 ** [JOINT LOSS] ** : 0.855281 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002554 | Grad Max: 0.088602 -> Layer: shared_layers.0.bias | Grad Mean: 0.143342 | Grad Max: 0.875093 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005790 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005515 | Grad Max: 0.005515 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000958 | Grad Max: 0.184235 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016698 | Grad Max: 1.021310 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.004360 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005594 | Grad Max: 0.042428 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000994 | Grad Max: 0.003395 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000093 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.001087 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001112 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003575 | Grad Max: 
0.003575 [GRADIENT NORM TOTAL] 3.2787 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.967 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50739634 0.49260363] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 714/1334 | B: 592/1264 | C: 370/1006 [LOSS Ex1] A: 0.63952 | B: 0.62840 | C: 0.61917 [LOGITS Ex2 A] Mean Abs: 2.152 | Max: 8.942 [LOSS Ex2] A: 0.11609 | B: 0.33557 | C: 0.25745 ** [JOINT LOSS] ** : 0.865400 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003068 | Grad Max: 0.107620 -> Layer: shared_layers.0.bias | Grad Mean: 0.281411 | Grad Max: 1.268841 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.006222 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000314 | Grad Max: 0.000314 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001735 | Grad Max: 0.508929 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031363 | Grad Max: 2.837721 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.009785 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014486 | Grad Max: 0.103243 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002747 | Grad Max: 0.006054 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000724 | Grad Max: 0.001883 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000582 | Grad Max: 0.001766 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013887 | Grad Max: 0.013887 [GRADIENT NORM TOTAL] 6.3713 [EPOCH SUMMARY] Train Loss: 0.8635 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8463 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8482 -> New: 0.8463) ############################## EPOCH 129/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.913 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5100385 0.4899615] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 708/1340 | B: 640/1408 | C: 553/1495 [LOSS Ex1] A: 0.63641 | B: 0.62761 | C: 0.61910 [LOGITS Ex2 A] Mean Abs: 2.161 | Max: 6.016 [LOSS Ex2] A: 0.12278 | B: 0.35157 | C: 0.23747 ** [JOINT LOSS] ** : 0.864985 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005707 | Grad Max: 0.207629 -> Layer: shared_layers.0.bias | Grad Mean: 0.211517 | Grad Max: 0.984209 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006585 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003535 | Grad Max: 0.003535 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.526078 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023650 | Grad Max: 2.937972 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.006379 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004580 | Grad Max: 0.049479 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000132 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000558 | Grad Max: 0.002835 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000142 | Grad Max: 0.000805 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000187 | Grad Max: 0.000672 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002181 | Grad Max: 0.002181 [GRADIENT NORM TOTAL] 5.3454 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.942 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5051503 0.49484965] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 715/1333 | B: 630/1418 | C: 553/1495 [LOSS Ex1] A: 0.63540 | B: 0.62827 | C: 0.61759 [LOGITS Ex2 A] Mean Abs: 2.142 | Max: 8.446 [LOSS Ex2] A: 0.14214 | B: 0.35194 | C: 0.22494 ** [JOINT LOSS] ** : 0.866759 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007885 | Grad Max: 0.344048 -> Layer: shared_layers.0.bias | Grad Mean: 0.207825 | Grad Max: 0.667572 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002270 | Grad Max: 0.006492 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003923 | Grad Max: 0.003923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001698 | Grad Max: 0.296531 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029015 | Grad Max: 1.611314 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000182 | Grad Max: 0.006257 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011960 | Grad Max: 0.056236 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000358 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002790 | Grad Max: 0.006602 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000149 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000710 | Grad Max: 0.001774 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000552 | Grad Max: 0.001509 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011918 | Grad Max: 0.011918 [GRADIENT NORM TOTAL] 5.2166 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.779 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017795 0.49822047] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.555 | Std: 0.072 [MASKS] A(Pass/Fail): 681/1367 | B: 633/1415 | C: 537/1511 [LOSS Ex1] A: 0.64385 | B: 0.62390 | C: 0.62108 [LOGITS Ex2 A] Mean Abs: 2.108 | Max: 5.979 [LOSS Ex2] A: 0.11353 | B: 0.32649 | C: 0.22647 ** [JOINT LOSS] ** : 0.851773 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002512 | Grad Max: 0.068685 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.085445 | Grad Max: 0.333977 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.005882 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006021 | Grad Max: 0.006021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000768 | Grad Max: 0.172553 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013078 | Grad Max: 0.955764 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.003502 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003696 | Grad Max: 0.031683 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000148 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000611 | Grad Max: 0.003068 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000149 | Grad Max: 0.000839 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000464 | Grad Max: 0.001069 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001594 | Grad Max: 0.001594 [GRADIENT NORM TOTAL] 2.4660 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.121 | Max: 0.700 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439862 0.45601383] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.554 | Std: 0.070 [MASKS] A(Pass/Fail): 682/1366 | B: 592/1264 | C: 529/1519 [LOSS Ex1] A: 0.64402 | B: 0.62821 | C: 0.62181 [LOGITS Ex2 A] Mean Abs: 2.101 | Max: 5.940 [LOSS Ex2] A: 0.11997 | B: 0.31643 | C: 0.24345 ** [JOINT LOSS] ** : 0.857961 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002782 | Grad Max: 0.077492 -> Layer: shared_layers.0.bias | Grad Mean: 0.094337 | Grad Max: 0.517804 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.006151 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009549 | Grad Max: 0.009549 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000745 | Grad Max: 0.261643 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012918 | Grad Max: 1.455987 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 
| Grad Max: 0.002768 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002723 | Grad Max: 0.021789 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000147 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000535 | Grad Max: 0.002740 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000136 | Grad Max: 0.000658 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000354 | Grad Max: 0.001292 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002532 | Grad Max: 0.002533 [GRADIENT NORM TOTAL] 3.0685 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.876 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7627915 0.23720849] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.074 [MASKS] A(Pass/Fail): 745/1303 | B: 641/1407 | C: 547/1501 [LOSS Ex1] A: 0.63794 | B: 0.62742 | C: 0.61778 [LOGITS Ex2 A] Mean Abs: 2.166 | Max: 6.109 [LOSS Ex2] A: 0.11464 | B: 0.35120 | C: 0.25397 ** [JOINT LOSS] ** : 0.867652 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003156 | Grad Max: 0.097638 -> Layer: shared_layers.0.bias | Grad Mean: 0.233562 | Grad Max: 1.214011 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.005944 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001078 | Grad Max: 0.001078 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001694 | Grad Max: 0.370108 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030379 | Grad Max: 2.070881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000180 | Grad Max: 0.009348 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012966 | Grad Max: 0.097438 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000258 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002396 | Grad Max: 0.005973 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000132 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000581 | Grad Max: 
0.001831 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001119 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009025 | Grad Max: 0.009025 [GRADIENT NORM TOTAL] 5.8435 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.971 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50080895 0.49919108] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.072 [MASKS] A(Pass/Fail): 716/1332 | B: 630/1418 | C: 514/1534 [LOSS Ex1] A: 0.64465 | B: 0.62808 | C: 0.62027 [LOGITS Ex2 A] Mean Abs: 2.173 | Max: 5.905 [LOSS Ex2] A: 0.10983 | B: 0.34399 | C: 0.24043 ** [JOINT LOSS] ** : 0.862414 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005196 | Grad Max: 0.189595 -> Layer: shared_layers.0.bias | Grad Mean: 0.148567 | Grad Max: 0.507771 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005805 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006182 | Grad Max: 0.006182 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001067 | Grad Max: 0.418932 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017068 | Grad Max: 2.363113 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003650 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003527 | Grad Max: 0.027901 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000226 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000927 | Grad Max: 0.003346 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000246 | Grad Max: 0.000888 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001304 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005536 | Grad Max: 0.005536 [GRADIENT NORM TOTAL] 4.1787 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.681 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.70503616 0.29496378] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.073 [MASKS] A(Pass/Fail): 709/1339 | B: 633/1415 | C: 528/1520 [LOSS Ex1] A: 0.64039 | B: 0.62370 | C: 0.62203 [LOGITS Ex2 A] Mean Abs: 2.145 | Max: 6.514 [LOSS Ex2] A: 0.13376 | B: 0.31815 | C: 0.22229 ** [JOINT LOSS] ** : 0.853443 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006961 | Grad Max: 0.228961 -> Layer: shared_layers.0.bias | Grad Mean: 0.214614 | Grad Max: 0.749895 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006227 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000680 | Grad Max: 0.000680 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001285 | Grad Max: 0.504285 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021311 | Grad Max: 2.806099 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004544 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006559 | Grad Max: 0.038096 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001626 | Grad Max: 0.004341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000106 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000416 | Grad Max: 0.001285 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001492 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007309 | Grad Max: 0.007309 [GRADIENT NORM TOTAL] 5.6035 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.787 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62469965 0.3753004 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 588/1028 | B: 592/1264 | C: 554/1494 [LOSS Ex1] A: 0.63863 | B: 0.62802 | C: 0.62388 [LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.787 [LOSS Ex2] A: 0.11685 | B: 0.31523 | C: 0.24967 ** [JOINT LOSS] ** : 0.857424 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight 
| Grad Mean: 0.003356 | Grad Max: 0.129512 -> Layer: shared_layers.0.bias | Grad Mean: 0.147307 | Grad Max: 0.570003 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006253 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001580 | Grad Max: 0.001580 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000845 | Grad Max: 0.410632 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013884 | Grad Max: 2.279090 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003057 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002126 | Grad Max: 0.024275 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000133 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000386 | Grad Max: 0.002199 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000099 | Grad Max: 0.000625 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.000788 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001169 | Grad Max: 0.001169 [GRADIENT NORM TOTAL] 4.3381 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.974 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074751 0.4925249] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 714/1334 | B: 642/1406 | C: 513/1535 [LOSS Ex1] A: 0.63924 | B: 0.62723 | C: 0.62447 [LOGITS Ex2 A] Mean Abs: 2.208 | Max: 8.700 [LOSS Ex2] A: 0.11753 | B: 0.34012 | C: 0.24563 ** [JOINT LOSS] ** : 0.864747 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004040 | Grad Max: 0.114439 -> Layer: shared_layers.0.bias | Grad Mean: 0.220191 | Grad Max: 1.464141 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | Grad Max: 0.006031 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003742 | Grad Max: 0.003742 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001500 | Grad Max: 0.283386 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.027368 | Grad Max: 1.588432 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.007690 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014592 | Grad Max: 0.080773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000313 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002986 | Grad Max: 0.007426 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000710 | Grad Max: 0.001927 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000472 | Grad Max: 0.001339 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011050 | Grad Max: 0.011050 [GRADIENT NORM TOTAL] 4.9043 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.919 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50987405 0.49012598] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 708/1340 | B: 630/1418 | C: 508/1540 [LOSS Ex1] A: 0.63612 | B: 0.62789 | C: 0.62564 [LOGITS Ex2 A] Mean Abs: 2.178 | Max: 6.038 [LOSS Ex2] A: 0.12251 | B: 0.33402 | C: 0.24135 ** [JOINT LOSS] ** : 0.862506 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004917 | Grad Max: 0.178054 -> Layer: shared_layers.0.bias | Grad Mean: 0.219822 | Grad Max: 0.897702 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005854 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000348 | Grad Max: 0.000348 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001460 | Grad Max: 0.199337 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024761 | Grad Max: 1.061628 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.005404 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009038 | Grad Max: 0.053103 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001454 | Grad Max: 0.004242 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | 
Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000360 | Grad Max: 0.001154 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000278 | Grad Max: 0.001024 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006434 | Grad Max: 0.006434 [GRADIENT NORM TOTAL] 4.5771 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.948 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50530034 0.49469963] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 715/1333 | B: 635/1413 | C: 502/1546 [LOSS Ex1] A: 0.63510 | B: 0.62351 | C: 0.62582 [LOGITS Ex2 A] Mean Abs: 2.136 | Max: 6.846 [LOSS Ex2] A: 0.13957 | B: 0.33385 | C: 0.23885 ** [JOINT LOSS] ** : 0.865569 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005888 | Grad Max: 0.259829 -> Layer: shared_layers.0.bias | Grad Mean: 0.174381 | Grad Max: 0.675488 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002332 | Grad Max: 0.006217 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006307 | Grad Max: 0.006307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.155981 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023032 | Grad Max: 0.828638 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004686 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005247 | Grad Max: 0.050639 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000165 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000684 | Grad Max: 0.002984 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000200 | Grad Max: 0.000850 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000177 | Grad Max: 0.000821 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004996 | Grad Max: 0.004996 [GRADIENT NORM TOTAL] 3.9123 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.784 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501828 0.498172] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 681/1367 | B: 592/1264 | C: 539/1509 [LOSS Ex1] A: 0.64357 | B: 0.62782 | C: 0.62068 [LOGITS Ex2 A] Mean Abs: 2.130 | Max: 6.576 [LOSS Ex2] A: 0.12014 | B: 0.32041 | C: 0.25677 ** [JOINT LOSS] ** : 0.863130 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.113401 -> Layer: shared_layers.0.bias | Grad Mean: 0.123345 | Grad Max: 0.521762 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005793 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002955 | Grad Max: 0.002955 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000975 | Grad Max: 0.292307 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016981 | Grad Max: 1.642169 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.003061 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003712 | Grad Max: 0.026231 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000179 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000787 | Grad Max: 0.003067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000183 | Grad Max: 0.000840 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000257 | Grad Max: 0.000867 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002452 | Grad Max: 0.002452 [GRADIENT NORM TOTAL] 3.8777 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.704 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54393697 0.45606303] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.070 [MASKS] A(Pass/Fail): 683/1365 | B: 642/1406 | C: 526/1522 [LOSS Ex1] A: 0.64375 | B: 0.62705 | C: 0.62291 [LOGITS Ex2 A] Mean Abs: 2.122 | Max: 6.332 [LOSS Ex2] A: 0.12625 | B: 0.34263 | C: 0.25152 
** [JOINT LOSS] ** : 0.871369 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.057211 -> Layer: shared_layers.0.bias | Grad Mean: 0.115348 | Grad Max: 0.564649 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005507 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005440 | Grad Max: 0.005440 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000860 | Grad Max: 0.181933 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014978 | Grad Max: 1.017726 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.004595 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004542 | Grad Max: 0.037667 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000179 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000805 | Grad Max: 0.003744 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000171 | Grad Max: 0.000914 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000209 | Grad Max: 0.000624 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001138 | Grad Max: 0.001138 [GRADIENT NORM TOTAL] 3.0166 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.881 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7641752 0.2358248] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.074 [MASKS] A(Pass/Fail): 745/1303 | B: 631/1417 | C: 341/1035 [LOSS Ex1] A: 0.63767 | B: 0.62770 | C: 0.62957 [LOGITS Ex2 A] Mean Abs: 2.152 | Max: 6.342 [LOSS Ex2] A: 0.11494 | B: 0.34413 | C: 0.27534 ** [JOINT LOSS] ** : 0.876448 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004239 | Grad Max: 0.123743 -> Layer: shared_layers.0.bias | Grad Mean: 0.100387 | Grad Max: 0.535738 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.006120 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010006 | Grad Max: 0.010006 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.000926 | Grad Max: 0.136823 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015634 | Grad Max: 0.760092 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003419 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004232 | Grad Max: 0.026185 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000957 | Grad Max: 0.003331 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000255 | Grad Max: 0.000936 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000287 | Grad Max: 0.001055 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005638 | Grad Max: 0.005638 [GRADIENT NORM TOTAL] 2.7645 [EPOCH SUMMARY] Train Loss: 0.8633 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8453 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8463 -> New: 0.8453) ############################## EPOCH 130/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.978 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50072485 0.49927515] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.073 [MASKS] A(Pass/Fail): 716/1332 | B: 635/1413 | C: 537/1511 [LOSS Ex1] A: 0.64440 | B: 0.62331 | C: 0.62227 [LOGITS Ex2 A] Mean Abs: 2.158 | Max: 6.053 [LOSS Ex2] A: 0.10896 | B: 0.33039 | C: 0.23933 ** [JOINT LOSS] ** : 0.856222 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003395 | Grad Max: 0.120065 -> Layer: shared_layers.0.bias | Grad Mean: 0.071377 | Grad Max: 0.448864 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005741 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005791 | Grad Max: 0.005791 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000764 | Grad Max: 0.172814 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012093 | Grad 
Max: 0.970880 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.003039 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002993 | Grad Max: 0.027614 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000196 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000605 | Grad Max: 0.002633 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000147 | Grad Max: 0.000607 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001075 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002982 | Grad Max: 0.002982 [GRADIENT NORM TOTAL] 2.4859 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.685 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70592654 0.29407352] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 709/1339 | B: 593/1263 | C: 543/1505 [LOSS Ex1] A: 0.64012 | B: 0.62762 | C: 0.62050 [LOGITS Ex2 A] Mean Abs: 2.135 | Max: 6.004 [LOSS Ex2] A: 0.13431 | B: 0.31725 | C: 0.22679 ** [JOINT LOSS] ** : 0.855529 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002886 | Grad Max: 0.077082 -> Layer: shared_layers.0.bias | Grad Mean: 0.134074 | Grad Max: 0.675476 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002168 | Grad Max: 0.006513 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000110 | Grad Max: 0.000110 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000962 | Grad Max: 0.413684 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016403 | Grad Max: 2.310817 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002844 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002479 | Grad Max: 0.023660 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000135 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000408 | Grad Max: 0.002388 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000078 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000123 | Grad Max: 0.000738 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001019 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001882 | Grad Max: 0.001882 [GRADIENT NORM TOTAL] 4.6541 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.793 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6251188 0.37488124] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 588/1028 | B: 643/1405 | C: 510/1538 [LOSS Ex1] A: 0.63835 | B: 0.62684 | C: 0.62632 [LOGITS Ex2 A] Mean Abs: 2.203 | Max: 9.788 [LOSS Ex2] A: 0.12182 | B: 0.34347 | C: 0.25285 ** [JOINT LOSS] ** : 0.869886 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003486 | Grad Max: 0.104576 -> Layer: shared_layers.0.bias | Grad Mean: 0.143131 | Grad Max: 0.576087 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006128 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000117 | Grad Max: 0.000117 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001290 | Grad Max: 0.352313 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023192 | Grad Max: 1.994053 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.004731 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010591 | Grad Max: 0.055500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000326 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002174 | Grad Max: 0.006324 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000123 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000510 | Grad Max: 0.001407 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.000994 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006779 | Grad Max: 0.006779 [GRADIENT NORM TOTAL] 4.3449 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.132 | Max: 0.980 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50753254 0.49246752] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 714/1334 | B: 632/1416 | C: 529/1519 [LOSS Ex1] A: 0.63897 | B: 0.62748 | C: 0.62497 [LOGITS Ex2 A] Mean Abs: 2.159 | Max: 8.713 [LOSS Ex2] A: 0.10929 | B: 0.34811 | C: 0.26088 ** [JOINT LOSS] ** : 0.869902 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003678 | Grad Max: 0.132777 -> Layer: shared_layers.0.bias | Grad Mean: 0.238907 | Grad Max: 1.305689 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005617 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000951 | Grad Max: 0.000951 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001506 | Grad Max: 0.436207 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026568 | Grad Max: 2.393966 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.006021 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008904 | Grad Max: 0.069545 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001604 | Grad Max: 0.004413 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001453 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001245 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008063 | Grad Max: 0.008063 [GRADIENT NORM TOTAL] 5.8689 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.925 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50980896 0.49019104] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 708/1340 | B: 636/1412 | C: 536/1512 [LOSS Ex1] A: 0.63584 | B: 0.62310 | C: 0.62221 [LOGITS Ex2 A] Mean Abs: 2.131 | Max: 5.913 [LOSS Ex2] A: 0.12749 | B: 0.32674 | C: 0.23725 ** [JOINT LOSS] ** : 
0.857542 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004356 | Grad Max: 0.159600 -> Layer: shared_layers.0.bias | Grad Mean: 0.141358 | Grad Max: 0.524654 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.006593 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003587 | Grad Max: 0.003587 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001130 | Grad Max: 0.132348 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018185 | Grad Max: 0.743231 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.006322 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004238 | Grad Max: 0.047662 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000139 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000514 | Grad Max: 0.002998 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000735 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.001001 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001002 | Grad Max: 0.001002 [GRADIENT NORM TOTAL] 3.1682 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.954 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054078 0.49459222] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 715/1333 | B: 593/1263 | C: 516/1532 [LOSS Ex1] A: 0.63483 | B: 0.62740 | C: 0.62052 [LOGITS Ex2 A] Mean Abs: 2.137 | Max: 7.530 [LOSS Ex2] A: 0.13936 | B: 0.32259 | C: 0.24096 ** [JOINT LOSS] ** : 0.861887 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005779 | Grad Max: 0.232786 -> Layer: shared_layers.0.bias | Grad Mean: 0.202409 | Grad Max: 0.830745 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.006196 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001672 | Grad Max: 0.001672 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001482 | Grad 
Max: 0.308772 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025416 | Grad Max: 1.709780 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.006187 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010044 | Grad Max: 0.068996 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000248 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002248 | Grad Max: 0.005521 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000572 | Grad Max: 0.001645 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000442 | Grad Max: 0.001384 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010187 | Grad Max: 0.010187 [GRADIENT NORM TOTAL] 4.9830 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.789 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5017989 0.4982011] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 682/1366 | B: 643/1405 | C: 559/1489 [LOSS Ex1] A: 0.64331 | B: 0.62663 | C: 0.61858 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 5.906 [LOSS Ex2] A: 0.11491 | B: 0.34067 | C: 0.22505 ** [JOINT LOSS] ** : 0.856381 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002772 | Grad Max: 0.094689 -> Layer: shared_layers.0.bias | Grad Mean: 0.093005 | Grad Max: 0.371983 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006206 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008489 | Grad Max: 0.008489 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000761 | Grad Max: 0.189840 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012824 | Grad Max: 1.039045 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.003428 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002259 | Grad Max: 0.019764 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000150 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000442 | Grad Max: 0.003347 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000784 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000455 | Grad Max: 0.001159 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001038 | Grad Max: 0.001038 [GRADIENT NORM TOTAL] 2.8074 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.122 | Max: 0.709 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54404527 0.4559547 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.070 [MASKS] A(Pass/Fail): 683/1365 | B: 632/1416 | C: 520/1528 [LOSS Ex1] A: 0.64350 | B: 0.62726 | C: 0.62199 [LOGITS Ex2 A] Mean Abs: 2.088 | Max: 6.667 [LOSS Ex2] A: 0.12463 | B: 0.33597 | C: 0.23442 ** [JOINT LOSS] ** : 0.862596 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004589 | Grad Max: 0.169253 -> Layer: shared_layers.0.bias | Grad Mean: 0.119903 | Grad Max: 0.829351 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005471 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003216 | Grad Max: 0.003216 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001003 | Grad Max: 0.338523 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017179 | Grad Max: 1.903362 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.004458 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005651 | Grad Max: 0.038873 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001306 | Grad Max: 0.004165 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000329 | Grad Max: 0.001016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001346 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006159 | Grad Max: 0.006159 [GRADIENT NORM TOTAL] 3.6282 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.888 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.76572686 0.2342731 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.074 [MASKS] A(Pass/Fail): 745/1303 | B: 636/1412 | C: 542/1506 [LOSS Ex1] A: 0.63740 | B: 0.62287 | C: 0.62041 [LOGITS Ex2 A] Mean Abs: 2.156 | Max: 6.798 [LOSS Ex2] A: 0.10866 | B: 0.32407 | C: 0.24640 ** [JOINT LOSS] ** : 0.853272 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002941 | Grad Max: 0.120071 -> Layer: shared_layers.0.bias | Grad Mean: 0.261785 | Grad Max: 1.572464 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.006240 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002723 | Grad Max: 0.002723 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.382930 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034298 | Grad Max: 2.133301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000188 | Grad Max: 0.008681 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013839 | Grad Max: 0.095937 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002552 | Grad Max: 0.005712 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000116 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000635 | Grad Max: 0.001715 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000415 | Grad Max: 0.001550 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010148 | Grad Max: 0.010148 [GRADIENT NORM TOTAL] 6.4559 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.985 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007525 0.49924746] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 716/1332 | B: 594/1262 | C: 528/1520 [LOSS Ex1] A: 0.64414 | B: 0.62718 | C: 0.62085 [LOGITS Ex2 A] Mean Abs: 
2.147 | Max: 6.543 [LOSS Ex2] A: 0.10511 | B: 0.32550 | C: 0.23383 ** [JOINT LOSS] ** : 0.852205 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005105 | Grad Max: 0.216962 -> Layer: shared_layers.0.bias | Grad Mean: 0.117907 | Grad Max: 0.410725 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005352 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003552 | Grad Max: 0.003552 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000979 | Grad Max: 0.217107 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015268 | Grad Max: 1.114668 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003283 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002678 | Grad Max: 0.024314 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000184 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000493 | Grad Max: 0.002906 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.000732 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000442 | Grad Max: 0.001087 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001181 | Grad Max: 0.001181 [GRADIENT NORM TOTAL] 3.1372 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.691 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7069754 0.29302457] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 709/1339 | B: 643/1405 | C: 527/1521 [LOSS Ex1] A: 0.63985 | B: 0.62641 | C: 0.62378 [LOGITS Ex2 A] Mean Abs: 2.130 | Max: 5.833 [LOSS Ex2] A: 0.13155 | B: 0.34427 | C: 0.23998 ** [JOINT LOSS] ** : 0.868611 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005086 | Grad Max: 0.146817 -> Layer: shared_layers.0.bias | Grad Mean: 0.370512 | Grad Max: 1.976229 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.006333 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.007205 | Grad Max: 0.007205 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.708533 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041581 | Grad Max: 3.948724 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000248 | Grad Max: 0.008499 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018312 | Grad Max: 0.104553 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003679 | Grad Max: 0.008088 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000912 | Grad Max: 0.002298 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000662 | Grad Max: 0.001742 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015748 | Grad Max: 0.015748 [GRADIENT NORM TOTAL] 8.9699 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.799 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6256767 0.37432334] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.074 [MASKS] A(Pass/Fail): 588/1028 | B: 632/1416 | C: 534/1514 [LOSS Ex1] A: 0.63808 | B: 0.62705 | C: 0.61909 [LOGITS Ex2 A] Mean Abs: 2.209 | Max: 7.999 [LOSS Ex2] A: 0.11859 | B: 0.33739 | C: 0.23940 ** [JOINT LOSS] ** : 0.859863 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003507 | Grad Max: 0.135279 -> Layer: shared_layers.0.bias | Grad Mean: 0.211251 | Grad Max: 0.934273 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.006385 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008261 | Grad Max: 0.008261 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001143 | Grad Max: 0.585912 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019585 | Grad Max: 3.274876 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003755 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003484 | Grad Max: 0.035815 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | 
Grad Max: 0.000131 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000460 | Grad Max: 0.002661 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000600 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000302 | Grad Max: 0.000889 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001728 | Grad Max: 0.001728 [GRADIENT NORM TOTAL] 6.0334 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.987 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075193 0.49248073] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 714/1334 | B: 636/1412 | C: 565/1483 [LOSS Ex1] A: 0.63870 | B: 0.62266 | C: 0.61626 [LOGITS Ex2 A] Mean Abs: 2.201 | Max: 8.188 [LOSS Ex2] A: 0.11567 | B: 0.32072 | C: 0.24757 ** [JOINT LOSS] ** : 0.853865 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009430 | Grad Max: 0.390026 -> Layer: shared_layers.0.bias | Grad Mean: 0.406245 | Grad Max: 1.301687 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002326 | Grad Max: 0.005647 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000231 | Grad Max: 0.000231 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002881 | Grad Max: 0.405355 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051154 | Grad Max: 2.269688 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.011704 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025557 | Grad Max: 0.128815 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000484 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005467 | Grad Max: 0.011713 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000238 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001342 | Grad Max: 0.003126 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000907 | Grad Max: 0.002311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021890 | Grad Max: 
0.021890 [GRADIENT NORM TOTAL] 8.4429 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.931 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50978655 0.49021348] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 708/1340 | B: 594/1262 | C: 349/1027 [LOSS Ex1] A: 0.63555 | B: 0.62698 | C: 0.62340 [LOGITS Ex2 A] Mean Abs: 2.197 | Max: 6.716 [LOSS Ex2] A: 0.12434 | B: 0.32359 | C: 0.24706 ** [JOINT LOSS] ** : 0.860309 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006341 | Grad Max: 0.260362 -> Layer: shared_layers.0.bias | Grad Mean: 0.104399 | Grad Max: 0.364072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002264 | Grad Max: 0.006650 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006796 | Grad Max: 0.006796 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001127 | Grad Max: 0.268936 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018441 | Grad Max: 1.390810 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006487 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008124 | Grad Max: 0.057056 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000273 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001952 | Grad Max: 0.005241 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000105 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000524 | Grad Max: 0.001424 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000427 | Grad Max: 0.001349 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009576 | Grad Max: 0.009576 [GRADIENT NORM TOTAL] 3.0979 [EPOCH SUMMARY] Train Loss: 0.8599 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8481 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 131/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054642 0.4945358] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 715/1333 | B: 644/1404 | C: 547/1501 [LOSS Ex1] A: 0.63454 | B: 0.62621 | C: 0.61786 [LOGITS Ex2 A] Mean Abs: 2.132 | Max: 6.579 [LOSS Ex2] A: 0.12677 | B: 0.36912 | C: 0.22496 ** [JOINT LOSS] ** : 0.866490 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005200 | Grad Max: 0.228726 -> Layer: shared_layers.0.bias | Grad Mean: 0.605777 | Grad Max: 3.078303 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.006558 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002464 | Grad Max: 0.002464 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003947 | Grad Max: 0.655362 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073270 | Grad Max: 3.653754 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.021675 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036749 | Grad Max: 0.220992 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007113 | Grad Max: 0.014594 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001764 | Grad Max: 0.004001 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001192 | Grad Max: 0.002254 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030424 | Grad Max: 0.030424 [GRADIENT NORM TOTAL] 13.6787 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.794 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50171345 0.49828658] | 
Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 683/1365 | B: 633/1415 | C: 519/1529 [LOSS Ex1] A: 0.64304 | B: 0.62685 | C: 0.61898 [LOGITS Ex2 A] Mean Abs: 2.102 | Max: 6.423 [LOSS Ex2] A: 0.11743 | B: 0.36989 | C: 0.24620 ** [JOINT LOSS] ** : 0.874128 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008806 | Grad Max: 0.209014 -> Layer: shared_layers.0.bias | Grad Mean: 0.666289 | Grad Max: 2.630035 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006311 | Grad Max: 0.006311 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004402 | Grad Max: 0.546635 -> Layer: exit2_layers.0.bias | Grad Mean: 0.081938 | Grad Max: 3.068626 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000598 | Grad Max: 0.021288 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044514 | Grad Max: 0.249931 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009014 | Grad Max: 0.018322 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000380 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002256 | Grad Max: 0.005372 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001668 | Grad Max: 0.003164 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040131 | Grad Max: 0.040131 [GRADIENT NORM TOTAL] 13.8758 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.714 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54405946 0.45594054] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 683/1365 | B: 636/1412 | C: 519/1529 [LOSS Ex1] A: 0.64326 | B: 0.62246 | C: 0.62469 [LOGITS Ex2 A] Mean Abs: 2.129 | Max: 5.715 [LOSS Ex2] A: 0.12426 | B: 0.33117 | C: 0.25254 ** [JOINT LOSS] ** : 0.866126 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006761 | Grad 
Max: 0.259019 -> Layer: shared_layers.0.bias | Grad Mean: 0.205778 | Grad Max: 0.869356 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005602 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006855 | Grad Max: 0.006855 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001669 | Grad Max: 0.395064 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029535 | Grad Max: 2.219909 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.006514 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014543 | Grad Max: 0.075817 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000348 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003221 | Grad Max: 0.007582 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000172 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000807 | Grad Max: 0.002187 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000658 | Grad Max: 0.001661 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014131 | Grad Max: 0.014131 [GRADIENT NORM TOTAL] 5.0476 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.894 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.76713264 0.23286738] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 745/1303 | B: 594/1262 | C: 510/1538 [LOSS Ex1] A: 0.63714 | B: 0.62679 | C: 0.62201 [LOGITS Ex2 A] Mean Abs: 2.226 | Max: 6.699 [LOSS Ex2] A: 0.12704 | B: 0.33620 | C: 0.26113 ** [JOINT LOSS] ** : 0.870100 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006006 | Grad Max: 0.308051 -> Layer: shared_layers.0.bias | Grad Mean: 0.761881 | Grad Max: 3.888122 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002224 | Grad Max: 0.006292 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005207 | Grad Max: 0.005207 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004953 | Grad Max: 0.896494 -> Layer: exit2_layers.0.bias | Grad Mean: 0.092905 | Grad Max: 5.000305 
-> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.021076 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046134 | Grad Max: 0.272610 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000760 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009007 | Grad Max: 0.019214 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000324 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002232 | Grad Max: 0.005075 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001484 | Grad Max: 0.002941 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037505 | Grad Max: 0.037505 [GRADIENT NORM TOTAL] 17.5215 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.991 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007691 0.49923092] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.073 [MASKS] A(Pass/Fail): 717/1331 | B: 644/1404 | C: 545/1503 [LOSS Ex1] A: 0.64391 | B: 0.62603 | C: 0.61906 [LOGITS Ex2 A] Mean Abs: 2.248 | Max: 5.655 [LOSS Ex2] A: 0.11856 | B: 0.38451 | C: 0.21704 ** [JOINT LOSS] ** : 0.869704 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007723 | Grad Max: 0.427495 -> Layer: shared_layers.0.bias | Grad Mean: 1.094741 | Grad Max: 5.554728 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.005599 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004117 | Grad Max: 0.004117 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007040 | Grad Max: 1.120743 -> Layer: exit2_layers.0.bias | Grad Mean: 0.132081 | Grad Max: 6.234292 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000884 | Grad Max: 0.034544 -> Layer: exit2_layers.3.bias | Grad Mean: 0.066979 | Grad Max: 0.401497 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001110 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013130 | Grad Max: 0.026967 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000499 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.003253 | Grad Max: 0.007453 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002164 | Grad Max: 0.003874 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055047 | Grad Max: 0.055047 [GRADIENT NORM TOTAL] 24.6067 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.695 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70786875 0.29213125] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 710/1338 | B: 633/1415 | C: 535/1513 [LOSS Ex1] A: 0.63961 | B: 0.62668 | C: 0.62456 [LOGITS Ex2 A] Mean Abs: 2.229 | Max: 6.359 [LOSS Ex2] A: 0.13537 | B: 0.37197 | C: 0.29248 ** [JOINT LOSS] ** : 0.896889 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006680 | Grad Max: 0.364169 -> Layer: shared_layers.0.bias | Grad Mean: 0.976795 | Grad Max: 4.846680 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005380 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006067 | Grad Max: 0.006067 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006230 | Grad Max: 1.077711 -> Layer: exit2_layers.0.bias | Grad Mean: 0.116566 | Grad Max: 6.015776 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000776 | Grad Max: 0.031053 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058911 | Grad Max: 0.349642 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.000983 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011438 | Grad Max: 0.023892 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000456 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002768 | Grad Max: 0.006921 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001792 | Grad Max: 0.003597 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045057 | Grad Max: 0.045057 [GRADIENT NORM TOTAL] 22.1342 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.133 | Max: 0.804 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62613225 0.37386772] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 588/1028 | B: 636/1412 | C: 558/1490 [LOSS Ex1] A: 0.63785 | B: 0.62231 | C: 0.61823 [LOGITS Ex2 A] Mean Abs: 2.250 | Max: 6.558 [LOSS Ex2] A: 0.12172 | B: 0.31792 | C: 0.24101 ** [JOINT LOSS] ** : 0.853014 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004760 | Grad Max: 0.167603 -> Layer: shared_layers.0.bias | Grad Mean: 0.304874 | Grad Max: 1.498444 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.006018 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003973 | Grad Max: 0.003973 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001972 | Grad Max: 0.461089 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034482 | Grad Max: 2.591306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000179 | Grad Max: 0.007730 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012775 | Grad Max: 0.090520 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000238 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002283 | Grad Max: 0.005934 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000552 | Grad Max: 0.001543 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | Grad Max: 0.001051 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008340 | Grad Max: 0.008340 [GRADIENT NORM TOTAL] 7.1397 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.993 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.507444 0.49255595] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 594/1262 | C: 551/1497 [LOSS Ex1] A: 0.63848 | B: 0.62664 | C: 0.62190 [LOGITS Ex2 A] Mean Abs: 2.173 | Max: 7.412 [LOSS Ex2] A: 0.12079 | B: 0.35368 | C: 0.24047 ** [JOINT LOSS] ** : 
0.867321 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011709 | Grad Max: 0.343394 -> Layer: shared_layers.0.bias | Grad Mean: 0.743006 | Grad Max: 3.109020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.005945 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001950 | Grad Max: 0.001950 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005051 | Grad Max: 0.592270 -> Layer: exit2_layers.0.bias | Grad Mean: 0.092833 | Grad Max: 3.284600 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000682 | Grad Max: 0.025994 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050605 | Grad Max: 0.295520 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000803 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010403 | Grad Max: 0.020617 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000446 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002587 | Grad Max: 0.006358 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001784 | Grad Max: 0.003245 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043961 | Grad Max: 0.043961 [GRADIENT NORM TOTAL] 15.2100 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.936 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5097895 0.49021047] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 709/1339 | B: 644/1404 | C: 530/1518 [LOSS Ex1] A: 0.63533 | B: 0.62590 | C: 0.62192 [LOGITS Ex2 A] Mean Abs: 2.163 | Max: 6.707 [LOSS Ex2] A: 0.12535 | B: 0.39678 | C: 0.24551 ** [JOINT LOSS] ** : 0.883596 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009918 | Grad Max: 0.326858 -> Layer: shared_layers.0.bias | Grad Mean: 1.010631 | Grad Max: 4.344743 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006561 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001310 | Grad Max: 0.001310 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006642 | 
Grad Max: 1.052877 -> Layer: exit2_layers.0.bias | Grad Mean: 0.123838 | Grad Max: 5.850938 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000838 | Grad Max: 0.032651 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063493 | Grad Max: 0.368448 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000104 | Grad Max: 0.001056 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012842 | Grad Max: 0.026088 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000513 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003201 | Grad Max: 0.007826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002181 | Grad Max: 0.004371 -> Layer: exit2_layers.12.bias | Grad Mean: 0.054662 | Grad Max: 0.054662 [GRADIENT NORM TOTAL] 22.0122 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.966 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505483 0.49451706] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.074 [MASKS] A(Pass/Fail): 715/1333 | B: 633/1415 | C: 562/1486 [LOSS Ex1] A: 0.63433 | B: 0.62656 | C: 0.62178 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 8.149 [LOSS Ex2] A: 0.13212 | B: 0.37102 | C: 0.24388 ** [JOINT LOSS] ** : 0.876558 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006164 | Grad Max: 0.271388 -> Layer: shared_layers.0.bias | Grad Mean: 0.757032 | Grad Max: 3.488138 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.006398 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004747 | Grad Max: 0.004747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004671 | Grad Max: 0.849158 -> Layer: exit2_layers.0.bias | Grad Mean: 0.087697 | Grad Max: 4.710185 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000603 | Grad Max: 0.024559 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046038 | Grad Max: 0.282265 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000700 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009094 | Grad Max: 
0.018066 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000355 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002289 | Grad Max: 0.005494 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001565 | Grad Max: 0.002992 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040048 | Grad Max: 0.040048 [GRADIENT NORM TOTAL] 16.2859 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.798 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016482 0.49835178] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 684/1364 | B: 636/1412 | C: 583/1465 [LOSS Ex1] A: 0.64285 | B: 0.62219 | C: 0.61443 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.064 [LOSS Ex2] A: 0.11318 | B: 0.32249 | C: 0.23589 ** [JOINT LOSS] ** : 0.850342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005497 | Grad Max: 0.162857 -> Layer: shared_layers.0.bias | Grad Mean: 0.217896 | Grad Max: 0.928172 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.006809 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012153 | Grad Max: 0.012153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001612 | Grad Max: 0.515708 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029305 | Grad Max: 2.863945 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.006451 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012210 | Grad Max: 0.070088 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000291 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002628 | Grad Max: 0.006695 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.001670 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000423 | Grad Max: 0.001427 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009928 | Grad Max: 0.009928 [GRADIENT NORM TOTAL] 5.7190 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.717 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439659 0.4560341] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 683/1365 | B: 595/1261 | C: 543/1505 [LOSS Ex1] A: 0.64308 | B: 0.62653 | C: 0.61821 [LOGITS Ex2 A] Mean Abs: 2.161 | Max: 5.585 [LOSS Ex2] A: 0.12486 | B: 0.32376 | C: 0.24129 ** [JOINT LOSS] ** : 0.859243 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008373 | Grad Max: 0.232125 -> Layer: shared_layers.0.bias | Grad Mean: 0.445471 | Grad Max: 1.822768 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005799 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004619 | Grad Max: 0.004619 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003019 | Grad Max: 0.413182 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055211 | Grad Max: 2.318281 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.014503 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027135 | Grad Max: 0.168504 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000515 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005668 | Grad Max: 0.012512 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000251 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001413 | Grad Max: 0.003416 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001022 | Grad Max: 0.002502 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024536 | Grad Max: 0.024536 [GRADIENT NORM TOTAL] 9.4138 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.897 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.76804686 0.23195307] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 745/1303 | B: 644/1404 | C: 561/1487 [LOSS Ex1] A: 0.63695 | B: 0.62580 | C: 0.62294 [LOGITS Ex2 A] Mean Abs: 
2.180 | Max: 6.880 [LOSS Ex2] A: 0.11732 | B: 0.33878 | C: 0.26307 ** [JOINT LOSS] ** : 0.868288 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007626 | Grad Max: 0.258203 -> Layer: shared_layers.0.bias | Grad Mean: 0.259585 | Grad Max: 1.171643 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.006400 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005014 | Grad Max: 0.005014 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.341912 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035156 | Grad Max: 1.872692 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.010168 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018878 | Grad Max: 0.115104 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000379 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004061 | Grad Max: 0.008820 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000186 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001013 | Grad Max: 0.002544 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.001660 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016386 | Grad Max: 0.016386 [GRADIENT NORM TOTAL] 5.7719 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.995 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007297 0.49927035] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 717/1331 | B: 633/1415 | C: 348/1028 [LOSS Ex1] A: 0.64373 | B: 0.62645 | C: 0.62796 [LOGITS Ex2 A] Mean Abs: 2.129 | Max: 6.268 [LOSS Ex2] A: 0.10989 | B: 0.35500 | C: 0.25062 ** [JOINT LOSS] ** : 0.871219 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005532 | Grad Max: 0.167078 -> Layer: shared_layers.0.bias | Grad Mean: 0.492459 | Grad Max: 2.107852 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005974 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.010693 | Grad Max: 0.010693 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003216 | Grad Max: 0.444117 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059802 | Grad Max: 2.494663 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000403 | Grad Max: 0.014627 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030395 | Grad Max: 0.168913 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000535 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006147 | Grad Max: 0.012922 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001548 | Grad Max: 0.003520 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.002218 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027291 | Grad Max: 0.027291 [GRADIENT NORM TOTAL] 10.5824 [EPOCH SUMMARY] Train Loss: 0.8695 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8521 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 132/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.698 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70847976 0.29152027] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 710/1338 | B: 636/1412 | C: 536/1512 [LOSS Ex1] A: 0.63943 | B: 0.62208 | C: 0.62083 [LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.435 [LOSS Ex2] A: 0.13600 | B: 0.35074 | C: 0.24053 ** [JOINT LOSS] ** : 0.869872 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009998 | Grad Max: 0.268569 -> Layer: shared_layers.0.bias | Grad Mean: 0.678049 | Grad Max: 2.774681 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006312 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002540 | Grad Max: 0.002540 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004383 | Grad Max: 0.585236 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.081876 | Grad Max: 3.172708 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000594 | Grad Max: 0.020704 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044458 | Grad Max: 0.251811 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000706 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008927 | Grad Max: 0.017692 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000376 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002227 | Grad Max: 0.005474 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001532 | Grad Max: 0.002888 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037863 | Grad Max: 0.037863 [GRADIENT NORM TOTAL] 13.7005 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.807 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62640864 0.37359133] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 588/1028 | B: 595/1261 | C: 537/1511 [LOSS Ex1] A: 0.63767 | B: 0.62643 | C: 0.62431 [LOGITS Ex2 A] Mean Abs: 2.149 | Max: 9.479 [LOSS Ex2] A: 0.12142 | B: 0.33094 | C: 0.24996 ** [JOINT LOSS] ** : 0.863576 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006406 | Grad Max: 0.196214 -> Layer: shared_layers.0.bias | Grad Mean: 0.301290 | Grad Max: 1.408984 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005539 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006383 | Grad Max: 0.006383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.249072 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037911 | Grad Max: 1.275433 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.012288 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022525 | Grad Max: 0.137987 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000439 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004621 | Grad Max: 0.010255 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000204 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001163 | Grad Max: 0.002910 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000864 | Grad Max: 0.002148 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020155 | Grad Max: 0.020155 [GRADIENT NORM TOTAL] 6.1769 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.996 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50749743 0.49250254] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 644/1404 | C: 526/1522 [LOSS Ex1] A: 0.63831 | B: 0.62570 | C: 0.62579 [LOGITS Ex2 A] Mean Abs: 2.199 | Max: 7.979 [LOSS Ex2] A: 0.11365 | B: 0.35558 | C: 0.27410 ** [JOINT LOSS] ** : 0.877709 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005207 | Grad Max: 0.283034 -> Layer: shared_layers.0.bias | Grad Mean: 0.768818 | Grad Max: 3.789115 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005893 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000079 | Grad Max: 0.000079 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004835 | Grad Max: 0.837824 -> Layer: exit2_layers.0.bias | Grad Mean: 0.090562 | Grad Max: 4.668731 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000617 | Grad Max: 0.022483 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047222 | Grad Max: 0.261635 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000788 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009352 | Grad Max: 0.018615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002309 | Grad Max: 0.005497 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001423 | Grad Max: 0.002754 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037430 | Grad Max: 0.037430 [GRADIENT NORM TOTAL] 16.9620 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.940 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5096875 0.49031255] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 709/1339 | B: 633/1415 | C: 561/1487 [LOSS Ex1] A: 0.63517 | B: 0.62635 | C: 0.61806 [LOGITS Ex2 A] Mean Abs: 2.210 | Max: 6.627 [LOSS Ex2] A: 0.13309 | B: 0.39149 | C: 0.27979 ** [JOINT LOSS] ** : 0.894647 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009642 | Grad Max: 0.414203 -> Layer: shared_layers.0.bias | Grad Mean: 1.228036 | Grad Max: 5.336411 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006537 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003409 | Grad Max: 0.003409 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007656 | Grad Max: 1.341062 -> Layer: exit2_layers.0.bias | Grad Mean: 0.143661 | Grad Max: 7.487618 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000985 | Grad Max: 0.040052 -> Layer: exit2_layers.3.bias | Grad Mean: 0.075725 | Grad Max: 0.453847 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000121 | Grad Max: 0.001234 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015211 | Grad Max: 0.031914 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000590 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003805 | Grad Max: 0.009365 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002510 | Grad Max: 0.005102 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064373 | Grad Max: 0.064373 [GRADIENT NORM TOTAL] 26.2670 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.969 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50556856 0.49443144] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 715/1333 | B: 636/1412 | C: 555/1493 [LOSS Ex1] A: 0.63417 | B: 0.62199 | C: 0.61971 [LOGITS Ex2 A] Mean Abs: 2.176 | Max: 
7.467 [LOSS Ex2] A: 0.14177 | B: 0.35442 | C: 0.25262 ** [JOINT LOSS] ** : 0.874895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010185 | Grad Max: 0.355247 -> Layer: shared_layers.0.bias | Grad Mean: 1.084433 | Grad Max: 4.566019 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.006414 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004761 | Grad Max: 0.004761 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006919 | Grad Max: 1.110409 -> Layer: exit2_layers.0.bias | Grad Mean: 0.129189 | Grad Max: 6.172784 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000904 | Grad Max: 0.035714 -> Layer: exit2_layers.3.bias | Grad Mean: 0.068784 | Grad Max: 0.414244 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001085 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013878 | Grad Max: 0.027908 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000508 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003498 | Grad Max: 0.007838 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002303 | Grad Max: 0.004216 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059194 | Grad Max: 0.059194 [GRADIENT NORM TOTAL] 22.9894 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.801 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50166225 0.4983377 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 684/1364 | B: 596/1260 | C: 564/1484 [LOSS Ex1] A: 0.64270 | B: 0.62634 | C: 0.61523 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 6.653 [LOSS Ex2] A: 0.11290 | B: 0.32188 | C: 0.22701 ** [JOINT LOSS] ** : 0.848684 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.092945 -> Layer: shared_layers.0.bias | Grad Mean: 0.243710 | Grad Max: 1.111518 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005834 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000472 | 
Grad Max: 0.000472 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001775 | Grad Max: 0.438802 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032200 | Grad Max: 2.453110 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.008347 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014137 | Grad Max: 0.093400 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000250 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002751 | Grad Max: 0.006075 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000135 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000727 | Grad Max: 0.001865 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000509 | Grad Max: 0.001732 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012997 | Grad Max: 0.012997 [GRADIENT NORM TOTAL] 6.3057 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.720 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5440134 0.4559866] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 683/1365 | B: 646/1402 | C: 554/1494 [LOSS Ex1] A: 0.64294 | B: 0.62562 | C: 0.62132 [LOGITS Ex2 A] Mean Abs: 2.033 | Max: 6.360 [LOSS Ex2] A: 0.12643 | B: 0.36622 | C: 0.25792 ** [JOINT LOSS] ** : 0.880148 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010411 | Grad Max: 0.267635 -> Layer: shared_layers.0.bias | Grad Mean: 0.789341 | Grad Max: 3.667359 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.006595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013742 | Grad Max: 0.013742 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005078 | Grad Max: 0.716664 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094715 | Grad Max: 4.043537 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000639 | Grad Max: 0.021948 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048181 | Grad Max: 0.277578 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 
0.000806 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009878 | Grad Max: 0.018903 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000359 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002519 | Grad Max: 0.005779 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001771 | Grad Max: 0.003130 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044228 | Grad Max: 0.044228 [GRADIENT NORM TOTAL] 16.9970 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.900 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.76881343 0.23118658] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.075 [MASKS] A(Pass/Fail): 745/1303 | B: 633/1415 | C: 572/1476 [LOSS Ex1] A: 0.63681 | B: 0.62627 | C: 0.61591 [LOGITS Ex2 A] Mean Abs: 2.037 | Max: 6.377 [LOSS Ex2] A: 0.12577 | B: 0.39212 | C: 0.24864 ** [JOINT LOSS] ** : 0.881843 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010212 | Grad Max: 0.344785 -> Layer: shared_layers.0.bias | Grad Mean: 1.042530 | Grad Max: 4.641877 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.006140 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002091 | Grad Max: 0.002091 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006557 | Grad Max: 0.802334 -> Layer: exit2_layers.0.bias | Grad Mean: 0.123613 | Grad Max: 4.535263 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000856 | Grad Max: 0.031929 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065524 | Grad Max: 0.366519 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001057 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013255 | Grad Max: 0.027004 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000520 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003340 | Grad Max: 0.008082 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002207 | Grad Max: 0.004202 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056527 | Grad Max: 0.056527 
[GRADIENT NORM TOTAL] 22.2275 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.998 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50069433 0.49930567] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 717/1331 | B: 637/1411 | C: 595/1453 [LOSS Ex1] A: 0.64361 | B: 0.62191 | C: 0.61361 [LOGITS Ex2 A] Mean Abs: 2.060 | Max: 6.301 [LOSS Ex2] A: 0.10714 | B: 0.36859 | C: 0.25087 ** [JOINT LOSS] ** : 0.868574 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008627 | Grad Max: 0.284707 -> Layer: shared_layers.0.bias | Grad Mean: 0.867000 | Grad Max: 3.791410 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005913 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005372 | Grad Max: 0.005372 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005457 | Grad Max: 0.973554 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102103 | Grad Max: 5.415605 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000692 | Grad Max: 0.028324 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052829 | Grad Max: 0.332604 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000815 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010706 | Grad Max: 0.021393 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000409 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002693 | Grad Max: 0.006475 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001789 | Grad Max: 0.003396 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046071 | Grad Max: 0.046071 [GRADIENT NORM TOTAL] 18.7556 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.700 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7089249 0.29107514] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 711/1337 | B: 597/1259 | C: 
518/1530 [LOSS Ex1] A: 0.63930 | B: 0.62626 | C: 0.62546 [LOGITS Ex2 A] Mean Abs: 2.082 | Max: 6.131 [LOSS Ex2] A: 0.12827 | B: 0.33235 | C: 0.25198 ** [JOINT LOSS] ** : 0.867877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003138 | Grad Max: 0.125322 -> Layer: shared_layers.0.bias | Grad Mean: 0.346150 | Grad Max: 1.662324 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005751 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000918 | Grad Max: 0.000918 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001977 | Grad Max: 0.564175 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036423 | Grad Max: 3.154107 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000212 | Grad Max: 0.008660 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015903 | Grad Max: 0.084697 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000323 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003105 | Grad Max: 0.007037 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000784 | Grad Max: 0.002449 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000514 | Grad Max: 0.001616 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013247 | Grad Max: 0.013247 [GRADIENT NORM TOTAL] 7.7391 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.810 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6265472 0.37345278] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 588/1028 | B: 646/1402 | C: 530/1518 [LOSS Ex1] A: 0.63755 | B: 0.62554 | C: 0.62238 [LOGITS Ex2 A] Mean Abs: 2.178 | Max: 8.044 [LOSS Ex2] A: 0.13266 | B: 0.35318 | C: 0.24817 ** [JOINT LOSS] ** : 0.873161 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008757 | Grad Max: 0.232645 -> Layer: shared_layers.0.bias | Grad Mean: 0.688699 | Grad Max: 2.995794 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002198 | Grad Max: 0.005636 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006419 | Grad Max: 0.006419 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004521 | Grad Max: 0.582163 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084143 | Grad Max: 3.250406 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.019259 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045278 | Grad Max: 0.237821 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000759 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009219 | Grad Max: 0.018623 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000388 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002323 | Grad Max: 0.005856 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001484 | Grad Max: 0.003157 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038471 | Grad Max: 0.038471 [GRADIENT NORM TOTAL] 14.6503 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.999 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50749314 0.4925069 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 633/1415 | C: 554/1494 [LOSS Ex1] A: 0.63820 | B: 0.62620 | C: 0.61710 [LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.922 [LOSS Ex2] A: 0.13242 | B: 0.36986 | C: 0.26065 ** [JOINT LOSS] ** : 0.881473 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014401 | Grad Max: 0.389565 -> Layer: shared_layers.0.bias | Grad Mean: 1.107231 | Grad Max: 4.852305 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002230 | Grad Max: 0.006027 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004771 | Grad Max: 0.004771 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007325 | Grad Max: 0.957052 -> Layer: exit2_layers.0.bias | Grad Mean: 0.136072 | Grad Max: 5.371665 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000942 | Grad Max: 0.033127 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.071209 | Grad Max: 0.390454 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000118 | Grad Max: 0.001129 -> Layer: exit2_layers.6.bias | Grad Mean: 0.014628 | Grad Max: 0.028681 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000533 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003726 | Grad Max: 0.008557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002427 | Grad Max: 0.004576 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062607 | Grad Max: 0.062607 [GRADIENT NORM TOTAL] 23.7592 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.942 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50961703 0.49038297] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 709/1339 | B: 637/1411 | C: 529/1519 [LOSS Ex1] A: 0.63505 | B: 0.62184 | C: 0.62185 [LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.221 [LOSS Ex2] A: 0.13481 | B: 0.34087 | C: 0.23282 ** [JOINT LOSS] ** : 0.862416 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011932 | Grad Max: 0.300555 -> Layer: shared_layers.0.bias | Grad Mean: 0.863655 | Grad Max: 3.741196 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002323 | Grad Max: 0.006029 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003994 | Grad Max: 0.003994 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005653 | Grad Max: 0.806187 -> Layer: exit2_layers.0.bias | Grad Mean: 0.104908 | Grad Max: 4.514004 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000741 | Grad Max: 0.025928 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056100 | Grad Max: 0.327325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000900 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011583 | Grad Max: 0.023211 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000451 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002976 | Grad Max: 0.006632 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001927 | 
Grad Max: 0.003914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050166 | Grad Max: 0.050166 [GRADIENT NORM TOTAL] 18.4110 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.972 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50559366 0.49440628] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 715/1333 | B: 598/1258 | C: 354/1022 [LOSS Ex1] A: 0.63406 | B: 0.62619 | C: 0.62784 [LOGITS Ex2 A] Mean Abs: 2.080 | Max: 8.552 [LOSS Ex2] A: 0.13176 | B: 0.32188 | C: 0.24499 ** [JOINT LOSS] ** : 0.862240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003936 | Grad Max: 0.157110 -> Layer: shared_layers.0.bias | Grad Mean: 0.105470 | Grad Max: 0.589201 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.007275 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000149 | Grad Max: 0.000149 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000953 | Grad Max: 0.244537 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016207 | Grad Max: 1.354697 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003558 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004512 | Grad Max: 0.033995 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001050 | Grad Max: 0.003545 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000258 | Grad Max: 0.000851 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001199 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003823 | Grad Max: 0.003823 [GRADIENT NORM TOTAL] 3.2813 [EPOCH SUMMARY] Train Loss: 0.8719 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8570 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 133/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.803 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016183 0.49838167] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.074 [MASKS] A(Pass/Fail): 684/1364 | B: 647/1401 | C: 566/1482 [LOSS Ex1] A: 0.64260 | B: 0.62547 | C: 0.62162 [LOGITS Ex2 A] Mean Abs: 2.008 | Max: 5.907 [LOSS Ex2] A: 0.12863 | B: 0.38318 | C: 0.23008 ** [JOINT LOSS] ** : 0.877195 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008345 | Grad Max: 0.282962 -> Layer: shared_layers.0.bias | Grad Mean: 0.833683 | Grad Max: 3.694321 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005620 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004785 | Grad Max: 0.004785 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005614 | Grad Max: 0.698312 -> Layer: exit2_layers.0.bias | Grad Mean: 0.104315 | Grad Max: 3.886066 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000703 | Grad Max: 0.025239 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053791 | Grad Max: 0.306871 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000818 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011080 | Grad Max: 0.021548 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000414 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002831 | Grad Max: 0.006762 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001836 | Grad Max: 0.003335 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048025 | Grad Max: 0.048025 [GRADIENT NORM TOTAL] 18.2798 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.722 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54404104 0.45595896] | 
Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 683/1365 | B: 633/1415 | C: 558/1490 [LOSS Ex1] A: 0.64284 | B: 0.62613 | C: 0.61663 [LOGITS Ex2 A] Mean Abs: 1.965 | Max: 7.018 [LOSS Ex2] A: 0.13634 | B: 0.39764 | C: 0.25553 ** [JOINT LOSS] ** : 0.891705 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015609 | Grad Max: 0.392005 -> Layer: shared_layers.0.bias | Grad Mean: 1.078942 | Grad Max: 4.337672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.005770 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006575 | Grad Max: 0.006575 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007188 | Grad Max: 0.848087 -> Layer: exit2_layers.0.bias | Grad Mean: 0.133554 | Grad Max: 4.638885 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000951 | Grad Max: 0.031957 -> Layer: exit2_layers.3.bias | Grad Mean: 0.072255 | Grad Max: 0.382839 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000122 | Grad Max: 0.001127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.014987 | Grad Max: 0.029205 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000574 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003832 | Grad Max: 0.008975 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002613 | Grad Max: 0.004995 -> Layer: exit2_layers.12.bias | Grad Mean: 0.065877 | Grad Max: 0.065877 [GRADIENT NORM TOTAL] 22.3570 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.903 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.76943624 0.23056376] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.075 [MASKS] A(Pass/Fail): 745/1303 | B: 637/1411 | C: 548/1500 [LOSS Ex1] A: 0.63670 | B: 0.62176 | C: 0.61839 [LOGITS Ex2 A] Mean Abs: 2.032 | Max: 6.268 [LOSS Ex2] A: 0.12496 | B: 0.34642 | C: 0.22619 ** [JOINT LOSS] ** : 0.858139 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011047 | Grad 
Max: 0.282140 -> Layer: shared_layers.0.bias | Grad Mean: 0.729472 | Grad Max: 2.867814 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.005889 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001469 | Grad Max: 0.001469 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004793 | Grad Max: 0.571273 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088359 | Grad Max: 3.185843 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000626 | Grad Max: 0.022206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047327 | Grad Max: 0.244631 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000770 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009910 | Grad Max: 0.019978 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000424 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002481 | Grad Max: 0.006558 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.003431 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040684 | Grad Max: 0.040684 [GRADIENT NORM TOTAL] 15.0714 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.001 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500706 0.499294] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 717/1331 | B: 598/1258 | C: 573/1475 [LOSS Ex1] A: 0.64350 | B: 0.62611 | C: 0.61977 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 5.961 [LOSS Ex2] A: 0.10876 | B: 0.31711 | C: 0.24882 ** [JOINT LOSS] ** : 0.854690 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.184434 -> Layer: shared_layers.0.bias | Grad Mean: 0.139690 | Grad Max: 0.751308 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005628 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001745 | Grad Max: 0.001745 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001121 | Grad Max: 0.268528 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017886 | Grad Max: 1.497478 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000068 | Grad Max: 0.003333 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002727 | Grad Max: 0.024796 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000141 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000320 | Grad Max: 0.002145 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000092 | Grad Max: 0.000565 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000450 | Grad Max: 0.001180 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000184 | Grad Max: 0.000184 [GRADIENT NORM TOTAL] 3.8267 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.703 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7093755 0.2906245] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 711/1337 | B: 647/1401 | C: 537/1511 [LOSS Ex1] A: 0.63919 | B: 0.62539 | C: 0.62333 [LOGITS Ex2 A] Mean Abs: 2.117 | Max: 5.657 [LOSS Ex2] A: 0.12988 | B: 0.35478 | C: 0.26834 ** [JOINT LOSS] ** : 0.880304 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005836 | Grad Max: 0.256267 -> Layer: shared_layers.0.bias | Grad Mean: 0.710384 | Grad Max: 3.354811 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006327 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000842 | Grad Max: 0.000842 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004423 | Grad Max: 0.786702 -> Layer: exit2_layers.0.bias | Grad Mean: 0.081917 | Grad Max: 4.398643 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000563 | Grad Max: 0.022365 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043102 | Grad Max: 0.264595 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000728 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008545 | Grad Max: 0.016842 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000357 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002192 | Grad Max: 0.005750 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001354 | Grad Max: 0.002899 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036675 | Grad Max: 0.036675 [GRADIENT NORM TOTAL] 15.2047 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.812 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62681 0.37319] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 588/1028 | B: 633/1415 | C: 564/1484 [LOSS Ex1] A: 0.63743 | B: 0.62604 | C: 0.61836 [LOGITS Ex2 A] Mean Abs: 2.155 | Max: 9.296 [LOSS Ex2] A: 0.11506 | B: 0.36004 | C: 0.27416 ** [JOINT LOSS] ** : 0.877027 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006885 | Grad Max: 0.237537 -> Layer: shared_layers.0.bias | Grad Mean: 0.753272 | Grad Max: 3.075963 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006278 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010051 | Grad Max: 0.010051 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004676 | Grad Max: 0.645649 -> Layer: exit2_layers.0.bias | Grad Mean: 0.087901 | Grad Max: 3.590152 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000615 | Grad Max: 0.024671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047385 | Grad Max: 0.294335 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000738 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009510 | Grad Max: 0.018615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000389 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002403 | Grad Max: 0.006106 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001491 | Grad Max: 0.002918 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038808 | Grad Max: 0.038808 [GRADIENT NORM TOTAL] 15.6435 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 
0.133 | Max: 1.002 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50749063 0.49250937] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 637/1411 | C: 527/1521 [LOSS Ex1] A: 0.63807 | B: 0.62167 | C: 0.62223 [LOGITS Ex2 A] Mean Abs: 2.111 | Max: 6.797 [LOSS Ex2] A: 0.11512 | B: 0.32270 | C: 0.24144 ** [JOINT LOSS] ** : 0.853745 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004505 | Grad Max: 0.109322 -> Layer: shared_layers.0.bias | Grad Mean: 0.358862 | Grad Max: 1.267490 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002354 | Grad Max: 0.006353 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013261 | Grad Max: 0.013261 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002466 | Grad Max: 0.356395 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045577 | Grad Max: 1.995037 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000324 | Grad Max: 0.011929 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024596 | Grad Max: 0.130720 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000366 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005113 | Grad Max: 0.010350 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000218 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001345 | Grad Max: 0.002919 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000907 | Grad Max: 0.002687 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024142 | Grad Max: 0.024142 [GRADIENT NORM TOTAL] 7.6463 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50963676 0.49036324] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 709/1339 | B: 598/1258 | C: 542/1506 [LOSS Ex1] A: 0.63492 | B: 0.62602 | C: 0.61958 [LOGITS Ex2 A] Mean Abs: 2.045 | Max: 5.505 [LOSS Ex2] A: 0.11670 | B: 0.34412 | C: 0.22981 ** [JOINT LOSS] ** : 0.857047 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004186 | Grad Max: 0.213070 -> Layer: shared_layers.0.bias | Grad Mean: 0.623315 | Grad Max: 2.835513 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002253 | Grad Max: 0.006086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003153 | Grad Max: 0.003153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003904 | Grad Max: 0.672458 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073197 | Grad Max: 3.752946 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000490 | Grad Max: 0.018794 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038102 | Grad Max: 0.217045 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000620 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007646 | Grad Max: 0.015828 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000319 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001932 | Grad Max: 0.004885 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001190 | Grad Max: 0.002733 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031800 | Grad Max: 0.031800 [GRADIENT NORM TOTAL] 13.5155 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.974 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056137 0.49438632] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 647/1401 | C: 541/1507 [LOSS Ex1] A: 0.63392 | B: 0.62530 | C: 0.62061 [LOGITS Ex2 A] Mean Abs: 2.019 | Max: 7.388 [LOSS Ex2] A: 0.13409 | B: 0.39254 | C: 0.22166 ** [JOINT LOSS] ** : 0.876041 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007403 | Grad Max: 0.313602 -> Layer: shared_layers.0.bias | Grad Mean: 0.955535 | Grad Max: 4.262785 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002260 | Grad Max: 0.006654 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002491 | Grad Max: 0.002491 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006153 | Grad Max: 
0.986255 -> Layer: exit2_layers.0.bias | Grad Mean: 0.115337 | Grad Max: 5.502318 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000776 | Grad Max: 0.029657 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059945 | Grad Max: 0.342330 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000911 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011970 | Grad Max: 0.023350 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000469 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003051 | Grad Max: 0.007302 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001901 | Grad Max: 0.003552 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050517 | Grad Max: 0.050517 [GRADIENT NORM TOTAL] 20.7859 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.805 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015696 0.4984303] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.074 [MASKS] A(Pass/Fail): 684/1364 | B: 633/1415 | C: 516/1532 [LOSS Ex1] A: 0.64248 | B: 0.62595 | C: 0.62505 [LOGITS Ex2 A] Mean Abs: 1.999 | Max: 6.158 [LOSS Ex2] A: 0.11269 | B: 0.38451 | C: 0.23100 ** [JOINT LOSS] ** : 0.873895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006310 | Grad Max: 0.283243 -> Layer: shared_layers.0.bias | Grad Mean: 0.849842 | Grad Max: 3.645814 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.006227 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011994 | Grad Max: 0.011994 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005352 | Grad Max: 0.974500 -> Layer: exit2_layers.0.bias | Grad Mean: 0.100645 | Grad Max: 5.428516 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000696 | Grad Max: 0.027667 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053754 | Grad Max: 0.326982 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000823 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010848 | Grad Max: 0.022049 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000439 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002795 | Grad Max: 0.006959 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001829 | Grad Max: 0.003624 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047806 | Grad Max: 0.047806 [GRADIENT NORM TOTAL] 18.2197 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.123 | Max: 0.724 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439764 0.45602354] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 683/1365 | B: 637/1411 | C: 554/1494 [LOSS Ex1] A: 0.64273 | B: 0.62159 | C: 0.62137 [LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.878 [LOSS Ex2] A: 0.12398 | B: 0.32779 | C: 0.25041 ** [JOINT LOSS] ** : 0.862620 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004076 | Grad Max: 0.116352 -> Layer: shared_layers.0.bias | Grad Mean: 0.215826 | Grad Max: 0.947965 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.006562 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011294 | Grad Max: 0.011294 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.598486 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024819 | Grad Max: 3.332857 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000136 | Grad Max: 0.006958 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009805 | Grad Max: 0.077276 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000173 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001774 | Grad Max: 0.004676 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000461 | Grad Max: 0.001257 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000394 | Grad Max: 0.001373 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008702 | Grad Max: 0.008702 [GRADIENT NORM TOTAL] 5.7023 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.905 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7700944 0.22990565] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.075 [MASKS] A(Pass/Fail): 745/1303 | B: 598/1258 | C: 545/1503 [LOSS Ex1] A: 0.63658 | B: 0.62594 | C: 0.61935 [LOGITS Ex2 A] Mean Abs: 2.120 | Max: 5.739 [LOSS Ex2] A: 0.13017 | B: 0.32662 | C: 0.25213 ** [JOINT LOSS] ** : 0.863594 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013396 | Grad Max: 0.381280 -> Layer: shared_layers.0.bias | Grad Mean: 0.737213 | Grad Max: 3.098425 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006511 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007008 | Grad Max: 0.007008 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005039 | Grad Max: 0.699982 -> Layer: exit2_layers.0.bias | Grad Mean: 0.092647 | Grad Max: 3.916414 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000637 | Grad Max: 0.020601 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047952 | Grad Max: 0.239413 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000807 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010144 | Grad Max: 0.019817 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000392 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002624 | Grad Max: 0.005902 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001759 | Grad Max: 0.003721 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043908 | Grad Max: 0.043908 [GRADIENT NORM TOTAL] 15.6191 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.003 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006981 0.49930185] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 717/1331 | B: 647/1401 | C: 564/1484 [LOSS Ex1] A: 0.64339 | B: 0.62523 | C: 0.61734 [LOGITS Ex2 A] Mean Abs: 
2.154 | Max: 5.860 [LOSS Ex2] A: 0.13730 | B: 0.37853 | C: 0.27545 ** [JOINT LOSS] ** : 0.892410 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013400 | Grad Max: 0.357914 -> Layer: shared_layers.0.bias | Grad Mean: 1.034173 | Grad Max: 4.694898 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.006086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004129 | Grad Max: 0.004129 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006912 | Grad Max: 1.006483 -> Layer: exit2_layers.0.bias | Grad Mean: 0.128723 | Grad Max: 5.599765 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000880 | Grad Max: 0.032282 -> Layer: exit2_layers.3.bias | Grad Mean: 0.067188 | Grad Max: 0.378535 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001092 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013854 | Grad Max: 0.028183 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000045 | Grad Max: 0.000561 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003562 | Grad Max: 0.008317 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002352 | Grad Max: 0.004332 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059520 | Grad Max: 0.059520 [GRADIENT NORM TOTAL] 22.7050 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.705 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.70975107 0.29024896] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 711/1337 | B: 633/1415 | C: 391/985 [LOSS Ex1] A: 0.63907 | B: 0.62588 | C: 0.61879 [LOGITS Ex2 A] Mean Abs: 2.103 | Max: 5.478 [LOSS Ex2] A: 0.14284 | B: 0.35883 | C: 0.27756 ** [JOINT LOSS] ** : 0.887652 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007715 | Grad Max: 0.323765 -> Layer: shared_layers.0.bias | Grad Mean: 0.843548 | Grad Max: 4.221925 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005986 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001744 | Grad Max: 0.001744 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005527 | Grad Max: 0.949914 -> Layer: exit2_layers.0.bias | Grad Mean: 0.103360 | Grad Max: 5.261958 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000689 | Grad Max: 0.024535 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053301 | Grad Max: 0.317356 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000905 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010860 | Grad Max: 0.021167 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000461 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002776 | Grad Max: 0.006751 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001776 | Grad Max: 0.003343 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045857 | Grad Max: 0.045857 [GRADIENT NORM TOTAL] 19.1604 [EPOCH SUMMARY] Train Loss: 0.8719 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8421 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8453 -> New: 0.8421) ############################## EPOCH 134/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.814 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6270077 0.37299225] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 588/1028 | B: 637/1411 | C: 548/1500 [LOSS Ex1] A: 0.63731 | B: 0.62151 | C: 0.61471 [LOGITS Ex2 A] Mean Abs: 2.100 | Max: 8.837 [LOSS Ex2] A: 0.11983 | B: 0.31764 | C: 0.23900 ** [JOINT LOSS] ** : 0.850004 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003151 | Grad Max: 0.080498 -> Layer: shared_layers.0.bias | Grad Mean: 0.153064 | Grad Max: 0.830153 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002312 | Grad Max: 0.006678 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008902 | Grad Max: 0.008902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001050 | Grad 
Max: 0.299800 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018229 | Grad Max: 1.658061 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.003648 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004786 | Grad Max: 0.035974 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000187 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000843 | Grad Max: 0.003264 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.000879 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000426 | Grad Max: 0.001066 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002690 | Grad Max: 0.002690 [GRADIENT NORM TOTAL] 4.0925 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.004 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074773 0.4925227] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 598/1258 | C: 564/1484 [LOSS Ex1] A: 0.63796 | B: 0.62586 | C: 0.62007 [LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.986 [LOSS Ex2] A: 0.12119 | B: 0.34718 | C: 0.23190 ** [JOINT LOSS] ** : 0.861389 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010179 | Grad Max: 0.267901 -> Layer: shared_layers.0.bias | Grad Mean: 0.715793 | Grad Max: 2.847285 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006240 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003901 | Grad Max: 0.003901 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004516 | Grad Max: 0.516004 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083813 | Grad Max: 2.780766 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000607 | Grad Max: 0.019671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046347 | Grad Max: 0.257326 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000782 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009613 | Grad Max: 0.019036 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000402 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002457 | Grad Max: 0.005801 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001625 | Grad Max: 0.003294 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040425 | Grad Max: 0.040425 [GRADIENT NORM TOTAL] 14.1366 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.947 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509579 0.49042097] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 710/1338 | B: 647/1401 | C: 561/1487 [LOSS Ex1] A: 0.63480 | B: 0.62516 | C: 0.61633 [LOGITS Ex2 A] Mean Abs: 2.004 | Max: 6.359 [LOSS Ex2] A: 0.13100 | B: 0.39022 | C: 0.22606 ** [JOINT LOSS] ** : 0.874522 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010755 | Grad Max: 0.285420 -> Layer: shared_layers.0.bias | Grad Mean: 0.904914 | Grad Max: 3.833302 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002325 | Grad Max: 0.006724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005306 | Grad Max: 0.005306 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005775 | Grad Max: 0.739996 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107448 | Grad Max: 4.117234 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000736 | Grad Max: 0.028349 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056546 | Grad Max: 0.323814 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000872 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011683 | Grad Max: 0.022946 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000447 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002995 | Grad Max: 0.007005 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001972 | Grad Max: 0.003447 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050114 | Grad Max: 0.050114 [GRADIENT NORM TOTAL] 18.9026 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.977 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056912 0.49430883] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 633/1415 | C: 544/1504 [LOSS Ex1] A: 0.63380 | B: 0.62580 | C: 0.62197 [LOGITS Ex2 A] Mean Abs: 2.032 | Max: 7.480 [LOSS Ex2] A: 0.13494 | B: 0.36431 | C: 0.27927 ** [JOINT LOSS] ** : 0.886699 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008372 | Grad Max: 0.226738 -> Layer: shared_layers.0.bias | Grad Mean: 0.613035 | Grad Max: 2.683043 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006540 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001726 | Grad Max: 0.001726 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004076 | Grad Max: 0.556365 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076006 | Grad Max: 3.105547 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000529 | Grad Max: 0.022490 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040672 | Grad Max: 0.244422 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000674 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008401 | Grad Max: 0.017050 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000338 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002212 | Grad Max: 0.005445 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001514 | Grad Max: 0.002928 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038650 | Grad Max: 0.038650 [GRADIENT NORM TOTAL] 13.0846 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.807 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016075 0.4983925] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.556 | Std: 0.074 [MASKS] A(Pass/Fail): 684/1364 | B: 637/1411 | C: 522/1526 [LOSS Ex1] A: 0.64236 | B: 0.62144 | C: 0.62010 [LOGITS Ex2 A] Mean Abs: 
2.038 | Max: 5.308 [LOSS Ex2] A: 0.11935 | B: 0.31742 | C: 0.22558 ** [JOINT LOSS] ** : 0.848753 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002379 | Grad Max: 0.086341 -> Layer: shared_layers.0.bias | Grad Mean: 0.214661 | Grad Max: 1.133284 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.005686 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005579 | Grad Max: 0.005579 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001383 | Grad Max: 0.260317 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025310 | Grad Max: 1.464876 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.008111 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011301 | Grad Max: 0.091448 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000248 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002238 | Grad Max: 0.005522 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000575 | Grad Max: 0.001490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000560 | Grad Max: 0.001694 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009289 | Grad Max: 0.009289 [GRADIENT NORM TOTAL] 4.7845 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.726 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439173 0.45608273] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.555 | Std: 0.071 [MASKS] A(Pass/Fail): 683/1365 | B: 598/1258 | C: 553/1495 [LOSS Ex1] A: 0.64261 | B: 0.62579 | C: 0.62121 [LOGITS Ex2 A] Mean Abs: 2.051 | Max: 5.978 [LOSS Ex2] A: 0.13198 | B: 0.31838 | C: 0.25612 ** [JOINT LOSS] ** : 0.865362 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005790 | Grad Max: 0.179241 -> Layer: shared_layers.0.bias | Grad Mean: 0.494274 | Grad Max: 2.299472 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005628 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.007872 | Grad Max: 0.007872 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003154 | Grad Max: 0.426194 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058426 | Grad Max: 2.382436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000402 | Grad Max: 0.015477 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030770 | Grad Max: 0.186290 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000573 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006344 | Grad Max: 0.014038 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001646 | Grad Max: 0.003977 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001050 | Grad Max: 0.002562 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027325 | Grad Max: 0.027325 [GRADIENT NORM TOTAL] 10.4173 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.908 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.77066547 0.22933453] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.075 [MASKS] A(Pass/Fail): 745/1303 | B: 647/1401 | C: 552/1496 [LOSS Ex1] A: 0.63646 | B: 0.62508 | C: 0.62270 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.363 [LOSS Ex2] A: 0.11286 | B: 0.33424 | C: 0.24102 ** [JOINT LOSS] ** : 0.857453 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006433 | Grad Max: 0.174484 -> Layer: shared_layers.0.bias | Grad Mean: 0.325216 | Grad Max: 1.102158 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002237 | Grad Max: 0.006279 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006588 | Grad Max: 0.006588 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.281465 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040695 | Grad Max: 1.553119 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000297 | Grad Max: 0.008442 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022228 | Grad Max: 0.107078 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | 
Grad Max: 0.000414 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004690 | Grad Max: 0.009669 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001212 | Grad Max: 0.003179 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000802 | Grad Max: 0.002233 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020088 | Grad Max: 0.020088 [GRADIENT NORM TOTAL] 6.6148 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.007 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006424 0.49935755] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 717/1331 | B: 633/1415 | C: 552/1496 [LOSS Ex1] A: 0.64327 | B: 0.62573 | C: 0.62231 [LOGITS Ex2 A] Mean Abs: 2.028 | Max: 5.836 [LOSS Ex2] A: 0.10524 | B: 0.35045 | C: 0.24056 ** [JOINT LOSS] ** : 0.862519 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004484 | Grad Max: 0.150804 -> Layer: shared_layers.0.bias | Grad Mean: 0.482897 | Grad Max: 2.131352 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005781 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000303 | Grad Max: 0.000303 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002969 | Grad Max: 0.533972 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055770 | Grad Max: 3.010740 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000369 | Grad Max: 0.013644 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028619 | Grad Max: 0.163690 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005797 | Grad Max: 0.011423 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.003599 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000963 | Grad Max: 0.002375 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024575 | Grad Max: 
0.024575 [GRADIENT NORM TOTAL] 10.2885 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.707 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7101686 0.28983137] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 711/1337 | B: 637/1411 | C: 562/1486 [LOSS Ex1] A: 0.63894 | B: 0.62135 | C: 0.61950 [LOGITS Ex2 A] Mean Abs: 2.009 | Max: 6.018 [LOSS Ex2] A: 0.13680 | B: 0.33866 | C: 0.25709 ** [JOINT LOSS] ** : 0.870779 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007853 | Grad Max: 0.198707 -> Layer: shared_layers.0.bias | Grad Mean: 0.604626 | Grad Max: 2.558968 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.005880 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000289 | Grad Max: 0.000289 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003838 | Grad Max: 0.599604 -> Layer: exit2_layers.0.bias | Grad Mean: 0.071500 | Grad Max: 3.367372 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.015353 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035807 | Grad Max: 0.193759 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000582 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007402 | Grad Max: 0.014274 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000312 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001938 | Grad Max: 0.004657 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001307 | Grad Max: 0.002551 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033278 | Grad Max: 0.033278 [GRADIENT NORM TOTAL] 12.7597 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.816 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6271701 0.37282997] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.075 [MASKS] A(Pass/Fail): 588/1028 | B: 598/1258 | 
C: 576/1472 [LOSS Ex1] A: 0.63718 | B: 0.62571 | C: 0.61850 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 8.241 [LOSS Ex2] A: 0.11312 | B: 0.32599 | C: 0.23915 ** [JOINT LOSS] ** : 0.853216 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005020 | Grad Max: 0.135628 -> Layer: shared_layers.0.bias | Grad Mean: 0.402868 | Grad Max: 1.706349 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.006866 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013702 | Grad Max: 0.013702 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002460 | Grad Max: 0.365297 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045919 | Grad Max: 2.038866 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000339 | Grad Max: 0.014651 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025832 | Grad Max: 0.155403 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005222 | Grad Max: 0.010592 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001343 | Grad Max: 0.003485 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000907 | Grad Max: 0.002377 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022743 | Grad Max: 0.022743 [GRADIENT NORM TOTAL] 8.0088 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.008 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5075223 0.49247777] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 647/1401 | C: 556/1492 [LOSS Ex1] A: 0.63783 | B: 0.62500 | C: 0.62449 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 7.482 [LOSS Ex2] A: 0.11754 | B: 0.35272 | C: 0.25241 ** [JOINT LOSS] ** : 0.869997 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006533 | Grad Max: 0.195265 -> Layer: shared_layers.0.bias | Grad Mean: 0.534950 | Grad Max: 2.498626 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002163 | Grad Max: 0.006069 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003679 | Grad Max: 0.003679 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003626 | Grad Max: 0.601007 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067463 | Grad Max: 3.348502 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.015444 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034076 | Grad Max: 0.186182 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000563 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007048 | Grad Max: 0.014223 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001833 | Grad Max: 0.004415 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001169 | Grad Max: 0.002675 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030277 | Grad Max: 0.030277 [GRADIENT NORM TOTAL] 12.1803 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.950 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5094571 0.4905429] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 710/1338 | B: 633/1415 | C: 530/1518 [LOSS Ex1] A: 0.63467 | B: 0.62564 | C: 0.62065 [LOGITS Ex2 A] Mean Abs: 2.118 | Max: 6.159 [LOSS Ex2] A: 0.13064 | B: 0.36000 | C: 0.25676 ** [JOINT LOSS] ** : 0.876123 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008920 | Grad Max: 0.317341 -> Layer: shared_layers.0.bias | Grad Mean: 0.836068 | Grad Max: 4.156336 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.006202 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002807 | Grad Max: 0.002807 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005413 | Grad Max: 0.890409 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101583 | Grad Max: 4.945156 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000673 | Grad Max: 0.023896 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.052314 | Grad Max: 0.288752 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000870 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010696 | Grad Max: 0.021345 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000441 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002772 | Grad Max: 0.006724 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001796 | Grad Max: 0.003634 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046030 | Grad Max: 0.046030 [GRADIENT NORM TOTAL] 18.5532 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.980 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50574607 0.49425387] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 637/1411 | C: 551/1497 [LOSS Ex1] A: 0.63367 | B: 0.62127 | C: 0.61626 [LOGITS Ex2 A] Mean Abs: 2.088 | Max: 5.975 [LOSS Ex2] A: 0.13592 | B: 0.32244 | C: 0.24655 ** [JOINT LOSS] ** : 0.858706 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006851 | Grad Max: 0.223153 -> Layer: shared_layers.0.bias | Grad Mean: 0.606314 | Grad Max: 2.878774 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002380 | Grad Max: 0.006449 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005286 | Grad Max: 0.005286 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004003 | Grad Max: 0.609850 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074348 | Grad Max: 3.424419 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000504 | Grad Max: 0.017349 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038924 | Grad Max: 0.220809 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000607 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008100 | Grad Max: 0.015525 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000348 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002118 | Grad Max: 0.005407 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001405 | 
Grad Max: 0.003183 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035809 | Grad Max: 0.035809 [GRADIENT NORM TOTAL] 13.4180 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.809 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016026 0.49839744] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 684/1364 | B: 600/1256 | C: 362/1014 [LOSS Ex1] A: 0.64223 | B: 0.62563 | C: 0.62020 [LOGITS Ex2 A] Mean Abs: 2.005 | Max: 5.937 [LOSS Ex2] A: 0.11687 | B: 0.32271 | C: 0.21754 ** [JOINT LOSS] ** : 0.848395 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004442 | Grad Max: 0.136458 -> Layer: shared_layers.0.bias | Grad Mean: 0.221505 | Grad Max: 0.887782 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006355 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009800 | Grad Max: 0.009800 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001383 | Grad Max: 0.155693 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024874 | Grad Max: 0.796285 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.006839 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013746 | Grad Max: 0.058904 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002939 | Grad Max: 0.007633 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000161 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000721 | Grad Max: 0.002269 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000530 | Grad Max: 0.001859 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010746 | Grad Max: 0.010746 [GRADIENT NORM TOTAL] 4.1589 [EPOCH SUMMARY] Train Loss: 0.8631 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8427 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 135/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.728 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438818 0.45611823] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 683/1365 | B: 647/1401 | C: 563/1485 [LOSS Ex1] A: 0.64248 | B: 0.62492 | C: 0.61967 [LOGITS Ex2 A] Mean Abs: 1.978 | Max: 6.023 [LOSS Ex2] A: 0.12490 | B: 0.34802 | C: 0.23917 ** [JOINT LOSS] ** : 0.866391 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003022 | Grad Max: 0.091965 -> Layer: shared_layers.0.bias | Grad Mean: 0.287283 | Grad Max: 1.274730 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005988 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005401 | Grad Max: 0.005401 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001743 | Grad Max: 0.600188 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032431 | Grad Max: 3.348663 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000218 | Grad Max: 0.007287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016753 | Grad Max: 0.090358 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000342 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003398 | Grad Max: 0.007793 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000159 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000869 | Grad Max: 0.002294 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001722 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014350 | Grad Max: 0.014350 [GRADIENT NORM TOTAL] 6.8996 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.911 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.77128774 0.22871223] | 
Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 744/1304 | B: 634/1414 | C: 566/1482 [LOSS Ex1] A: 0.63633 | B: 0.62556 | C: 0.61720 [LOGITS Ex2 A] Mean Abs: 2.065 | Max: 5.636 [LOSS Ex2] A: 0.11781 | B: 0.34000 | C: 0.22905 ** [JOINT LOSS] ** : 0.855318 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003797 | Grad Max: 0.097489 -> Layer: shared_layers.0.bias | Grad Mean: 0.113416 | Grad Max: 0.438873 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002226 | Grad Max: 0.005772 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003208 | Grad Max: 0.003208 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000932 | Grad Max: 0.158976 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016561 | Grad Max: 0.893350 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.005851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006502 | Grad Max: 0.057710 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000188 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001475 | Grad Max: 0.004062 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000405 | Grad Max: 0.001268 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001405 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007373 | Grad Max: 0.007373 [GRADIENT NORM TOTAL] 2.9405 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 1.010 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005774 0.49942264] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.074 [MASKS] A(Pass/Fail): 717/1331 | B: 637/1411 | C: 560/1488 [LOSS Ex1] A: 0.64314 | B: 0.62119 | C: 0.61682 [LOGITS Ex2 A] Mean Abs: 2.072 | Max: 5.492 [LOSS Ex2] A: 0.10312 | B: 0.31729 | C: 0.21049 ** [JOINT LOSS] ** : 0.837344 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001921 | Grad Max: 
0.043189 -> Layer: shared_layers.0.bias | Grad Mean: 0.125226 | Grad Max: 0.539824 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005694 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001507 | Grad Max: 0.001507 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000662 | Grad Max: 0.360826 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011468 | Grad Max: 1.977722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000039 | Grad Max: 0.002674 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001908 | Grad Max: 0.016267 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000128 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000364 | Grad Max: 0.002430 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000128 | Grad Max: 0.000695 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000637 | Grad Max: 0.001429 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001328 | Grad Max: 0.001328 [GRADIENT NORM TOTAL] 3.8245 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.709 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.710594 0.289406] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 712/1336 | B: 600/1256 | C: 565/1483 [LOSS Ex1] A: 0.63879 | B: 0.62553 | C: 0.61734 [LOGITS Ex2 A] Mean Abs: 2.069 | Max: 5.581 [LOSS Ex2] A: 0.13085 | B: 0.31436 | C: 0.23428 ** [JOINT LOSS] ** : 0.853718 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003286 | Grad Max: 0.067784 -> Layer: shared_layers.0.bias | Grad Mean: 0.194833 | Grad Max: 0.924442 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006027 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004843 | Grad Max: 0.004843 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001275 | Grad Max: 0.422251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023182 | Grad Max: 2.369298 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000114 | Grad Max: 0.004331 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008290 | Grad Max: 0.047160 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000214 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001756 | Grad Max: 0.005001 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001389 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001359 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006934 | Grad Max: 0.006934 [GRADIENT NORM TOTAL] 5.2124 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.819 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6273579 0.37264213] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 588/1028 | B: 647/1401 | C: 560/1488 [LOSS Ex1] A: 0.63702 | B: 0.62482 | C: 0.61849 [LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.868 [LOSS Ex2] A: 0.11668 | B: 0.34671 | C: 0.22906 ** [JOINT LOSS] ** : 0.857593 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001781 | Grad Max: 0.090071 -> Layer: shared_layers.0.bias | Grad Mean: 0.129846 | Grad Max: 1.101399 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002934 | Grad Max: 0.002934 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000981 | Grad Max: 0.278728 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017599 | Grad Max: 1.551839 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.006464 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007243 | Grad Max: 0.058905 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000217 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001378 | Grad Max: 0.003968 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000340 | Grad Max: 0.001041 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001039 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004430 | Grad Max: 0.004430 [GRADIENT NORM TOTAL] 3.7241 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.011 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50757766 0.4924223 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 635/1413 | C: 548/1500 [LOSS Ex1] A: 0.63767 | B: 0.62545 | C: 0.61971 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 8.026 [LOSS Ex2] A: 0.11256 | B: 0.34007 | C: 0.23433 ** [JOINT LOSS] ** : 0.856596 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001850 | Grad Max: 0.049770 -> Layer: shared_layers.0.bias | Grad Mean: 0.074632 | Grad Max: 0.346958 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.006028 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000304 | Grad Max: 0.000304 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000593 | Grad Max: 0.120398 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010483 | Grad Max: 0.662506 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.002621 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003072 | Grad Max: 0.024801 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000581 | Grad Max: 0.002784 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000750 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000434 | Grad Max: 0.001279 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002856 | Grad Max: 0.002856 [GRADIENT NORM TOTAL] 2.0353 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.133 | Max: 0.954 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093446 0.49065536] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 711/1337 | B: 637/1411 | C: 543/1505 [LOSS Ex1] A: 0.63449 | B: 0.62105 | C: 0.61984 [LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.254 [LOSS Ex2] A: 0.12030 | B: 0.31771 | C: 0.24439 ** [JOINT LOSS] ** : 0.852592 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002394 | Grad Max: 0.060113 -> Layer: shared_layers.0.bias | Grad Mean: 0.141248 | Grad Max: 0.764796 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002289 | Grad Max: 0.006086 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001869 | Grad Max: 0.001869 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001000 | Grad Max: 0.209856 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018073 | Grad Max: 1.162398 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004782 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007773 | Grad Max: 0.051027 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000210 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001592 | Grad Max: 0.004615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000407 | Grad Max: 0.001173 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001411 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007024 | Grad Max: 0.007024 [GRADIENT NORM TOTAL] 3.4739 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.984 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.505875 0.49412507] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 716/1332 | B: 600/1256 | C: 565/1483 [LOSS Ex1] A: 0.63348 | B: 0.62539 | C: 0.61567 [LOGITS Ex2 A] Mean Abs: 2.084 | Max: 9.020 [LOSS Ex2] A: 0.12323 | B: 0.32853 | C: 0.22955 ** [JOINT LOSS] ** : 0.851944 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003337 | Grad Max: 0.117587 -> Layer: shared_layers.0.bias | Grad Mean: 0.231074 | Grad Max: 0.978036 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002294 | Grad Max: 0.006393 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001501 | Grad Max: 0.001501 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001540 | Grad Max: 0.190653 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026723 | Grad Max: 1.055891 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000167 | Grad Max: 0.006855 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012167 | Grad Max: 0.083709 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000232 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002229 | Grad Max: 0.006065 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000585 | Grad Max: 0.001543 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001351 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010379 | Grad Max: 0.010379 [GRADIENT NORM TOTAL] 4.7280 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.813 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016365 0.4983635] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 684/1364 | B: 648/1400 | C: 524/1524 [LOSS Ex1] A: 0.64203 | B: 0.62466 | C: 0.62450 [LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.056 [LOSS Ex2] A: 0.11721 | B: 0.34137 | C: 0.27368 ** [JOINT LOSS] ** : 0.874485 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003078 | Grad Max: 0.073093 -> Layer: shared_layers.0.bias | Grad Mean: 0.137802 | Grad Max: 0.788849 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.005751 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005517 | Grad Max: 0.005517 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000819 | Grad Max: 
0.437790 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013689 | Grad Max: 2.450477 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.003287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002402 | Grad Max: 0.021946 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000130 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000551 | Grad Max: 0.002760 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000197 | Grad Max: 0.000947 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000227 | Grad Max: 0.000769 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005556 | Grad Max: 0.005556 [GRADIENT NORM TOTAL] 3.9368 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.731 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438446 0.4561554] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 683/1365 | B: 635/1413 | C: 541/1507 [LOSS Ex1] A: 0.64228 | B: 0.62527 | C: 0.62191 [LOGITS Ex2 A] Mean Abs: 2.124 | Max: 6.116 [LOSS Ex2] A: 0.11743 | B: 0.33542 | C: 0.25674 ** [JOINT LOSS] ** : 0.866353 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002233 | Grad Max: 0.069044 -> Layer: shared_layers.0.bias | Grad Mean: 0.208314 | Grad Max: 0.971210 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005486 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003518 | Grad Max: 0.003518 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001393 | Grad Max: 0.263251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025499 | Grad Max: 1.475172 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.008065 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011519 | Grad Max: 0.088324 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000254 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002198 | Grad Max: 0.005591 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000548 | Grad Max: 0.001486 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001221 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008545 | Grad Max: 0.008545 [GRADIENT NORM TOTAL] 4.8298 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.916 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7723748 0.22762527] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 744/1304 | B: 638/1410 | C: 567/1481 [LOSS Ex1] A: 0.63611 | B: 0.62087 | C: 0.61930 [LOGITS Ex2 A] Mean Abs: 2.118 | Max: 5.593 [LOSS Ex2] A: 0.11154 | B: 0.32179 | C: 0.23656 ** [JOINT LOSS] ** : 0.848724 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001904 | Grad Max: 0.057727 -> Layer: shared_layers.0.bias | Grad Mean: 0.113522 | Grad Max: 0.809070 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002260 | Grad Max: 0.006340 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.003366 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000772 | Grad Max: 0.329942 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013748 | Grad Max: 1.836416 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002839 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002723 | Grad Max: 0.021703 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000468 | Grad Max: 0.002519 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000106 | Grad Max: 0.000505 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001024 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000537 | Grad Max: 0.000537 [GRADIENT NORM TOTAL] 3.5591 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 1.016 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50055236 0.49944767] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 717/1331 | B: 600/1256 | C: 547/1501 [LOSS Ex1] A: 0.64292 | B: 0.62521 | C: 0.61716 [LOGITS Ex2 A] Mean Abs: 2.113 | Max: 6.141 [LOSS Ex2] A: 0.10776 | B: 0.33609 | C: 0.20790 ** [JOINT LOSS] ** : 0.845678 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005068 | Grad Max: 0.171448 -> Layer: shared_layers.0.bias | Grad Mean: 0.334059 | Grad Max: 1.567778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005436 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003976 | Grad Max: 0.003976 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.386953 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040160 | Grad Max: 2.171730 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000288 | Grad Max: 0.010675 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022128 | Grad Max: 0.116522 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000457 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004632 | Grad Max: 0.009965 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000238 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001171 | Grad Max: 0.003253 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.002402 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019110 | Grad Max: 0.019110 [GRADIENT NORM TOTAL] 7.1770 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.713 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71135134 0.2886487 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 712/1336 | B: 648/1400 | C: 534/1514 [LOSS Ex1] A: 0.63855 | B: 0.62449 | C: 0.62511 [LOGITS Ex2 A] Mean 
Abs: 2.115 | Max: 6.033 [LOSS Ex2] A: 0.13757 | B: 0.33948 | C: 0.26045 ** [JOINT LOSS] ** : 0.875217 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003955 | Grad Max: 0.127805 -> Layer: shared_layers.0.bias | Grad Mean: 0.180974 | Grad Max: 0.958177 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.006094 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001540 | Grad Max: 0.001540 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001257 | Grad Max: 0.165826 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022180 | Grad Max: 0.801565 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.004915 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010910 | Grad Max: 0.058415 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002385 | Grad Max: 0.005747 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000651 | Grad Max: 0.001773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000533 | Grad Max: 0.001552 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012541 | Grad Max: 0.012541 [GRADIENT NORM TOTAL] 3.8253 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.824 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6277436 0.37225637] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 588/1028 | B: 635/1413 | C: 376/1000 [LOSS Ex1] A: 0.63677 | B: 0.62509 | C: 0.62161 [LOGITS Ex2 A] Mean Abs: 2.214 | Max: 8.392 [LOSS Ex2] A: 0.12123 | B: 0.35392 | C: 0.26701 ** [JOINT LOSS] ** : 0.875211 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006634 | Grad Max: 0.271393 -> Layer: shared_layers.0.bias | Grad Mean: 0.660260 | Grad Max: 3.548107 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006130 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.008947 | Grad Max: 0.008947 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004335 | Grad Max: 0.786195 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080710 | Grad Max: 4.377571 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000518 | Grad Max: 0.019587 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040424 | Grad Max: 0.219390 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000668 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008217 | Grad Max: 0.016641 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000396 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002079 | Grad Max: 0.005286 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001347 | Grad Max: 0.002620 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032649 | Grad Max: 0.032649 [GRADIENT NORM TOTAL] 15.1453 [EPOCH SUMMARY] Train Loss: 0.8584 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8535 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 136/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.018 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076026 0.49239737] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 638/1410 | C: 556/1492 [LOSS Ex1] A: 0.63742 | B: 0.62069 | C: 0.61906 [LOGITS Ex2 A] Mean Abs: 2.177 | Max: 7.438 [LOSS Ex2] A: 0.12242 | B: 0.33823 | C: 0.25633 ** [JOINT LOSS] ** : 0.864719 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009357 | Grad Max: 0.296524 -> Layer: shared_layers.0.bias | Grad Mean: 0.809483 | Grad Max: 3.858690 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002235 | Grad Max: 0.005936 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001558 | Grad Max: 0.001558 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005334 | Grad Max: 0.783591 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.099053 | Grad Max: 4.357076 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.021983 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049701 | Grad Max: 0.253886 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000815 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010213 | Grad Max: 0.020458 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000416 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002624 | Grad Max: 0.006570 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001757 | Grad Max: 0.003515 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043310 | Grad Max: 0.043310 [GRADIENT NORM TOTAL] 17.8387 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.960 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50926834 0.49073163] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 711/1337 | B: 600/1256 | C: 579/1469 [LOSS Ex1] A: 0.63424 | B: 0.62503 | C: 0.61744 [LOGITS Ex2 A] Mean Abs: 2.168 | Max: 6.265 [LOSS Ex2] A: 0.11726 | B: 0.32062 | C: 0.21806 ** [JOINT LOSS] ** : 0.844217 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007325 | Grad Max: 0.155037 -> Layer: shared_layers.0.bias | Grad Mean: 0.423434 | Grad Max: 1.804322 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002298 | Grad Max: 0.006877 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006248 | Grad Max: 0.006248 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003025 | Grad Max: 0.351640 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055948 | Grad Max: 1.935275 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000381 | Grad Max: 0.013418 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028950 | Grad Max: 0.160872 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000540 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006047 | Grad Max: 0.012770 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000315 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001608 | Grad Max: 0.004165 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001117 | Grad Max: 0.002997 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027841 | Grad Max: 0.027841 [GRADIENT NORM TOTAL] 9.1708 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.990 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50599855 0.49400142] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 716/1332 | B: 648/1400 | C: 559/1489 [LOSS Ex1] A: 0.63323 | B: 0.62432 | C: 0.62116 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.916 [LOSS Ex2] A: 0.13522 | B: 0.35311 | C: 0.23427 ** [JOINT LOSS] ** : 0.867102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004468 | Grad Max: 0.188422 -> Layer: shared_layers.0.bias | Grad Mean: 0.476903 | Grad Max: 2.556624 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006088 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001324 | Grad Max: 0.001324 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002921 | Grad Max: 0.554850 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053507 | Grad Max: 3.075500 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.015736 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024091 | Grad Max: 0.167933 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004754 | Grad Max: 0.009862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000192 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001232 | Grad Max: 0.002991 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000839 | Grad Max: 0.002043 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021085 | Grad Max: 0.021085 [GRADIENT NORM TOTAL] 10.8021 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.818 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50163054 0.4983695 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.075 [MASKS] A(Pass/Fail): 684/1364 | B: 635/1413 | C: 564/1484 [LOSS Ex1] A: 0.64181 | B: 0.62493 | C: 0.61817 [LOGITS Ex2 A] Mean Abs: 2.044 | Max: 5.712 [LOSS Ex2] A: 0.11853 | B: 0.36237 | C: 0.25743 ** [JOINT LOSS] ** : 0.874411 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008773 | Grad Max: 0.280249 -> Layer: shared_layers.0.bias | Grad Mean: 0.827290 | Grad Max: 3.561022 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006331 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010744 | Grad Max: 0.010744 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005230 | Grad Max: 0.774873 -> Layer: exit2_layers.0.bias | Grad Mean: 0.098552 | Grad Max: 4.336165 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000667 | Grad Max: 0.022421 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051966 | Grad Max: 0.266315 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000781 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010513 | Grad Max: 0.020186 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000448 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002693 | Grad Max: 0.006615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001859 | Grad Max: 0.003542 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045155 | Grad Max: 0.045155 [GRADIENT NORM TOTAL] 17.7050 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.124 | Max: 0.735 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54389197 0.45610803] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 683/1365 | B: 638/1410 | C: 550/1498 [LOSS Ex1] A: 0.64207 | B: 0.62054 | C: 0.61831 [LOGITS Ex2 A] Mean Abs: 2.028 | Max: 
5.702 [LOSS Ex2] A: 0.12411 | B: 0.33631 | C: 0.22939 ** [JOINT LOSS] ** : 0.856909 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006574 | Grad Max: 0.182200 -> Layer: shared_layers.0.bias | Grad Mean: 0.575649 | Grad Max: 2.523529 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.005826 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006064 | Grad Max: 0.006064 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003574 | Grad Max: 0.595188 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067371 | Grad Max: 3.363993 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000450 | Grad Max: 0.015499 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034832 | Grad Max: 0.179254 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000552 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006988 | Grad Max: 0.014689 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000318 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001780 | Grad Max: 0.004541 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001197 | Grad Max: 0.002835 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029073 | Grad Max: 0.029073 [GRADIENT NORM TOTAL] 12.3312 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.921 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.77354807 0.22645193] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 744/1304 | B: 600/1256 | C: 543/1505 [LOSS Ex1] A: 0.63590 | B: 0.62488 | C: 0.62126 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 6.107 [LOSS Ex2] A: 0.11096 | B: 0.32448 | C: 0.25985 ** [JOINT LOSS] ** : 0.859113 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004320 | Grad Max: 0.153862 -> Layer: shared_layers.0.bias | Grad Mean: 0.115095 | Grad Max: 0.524855 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005822 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000342 | 
Grad Max: 0.000342 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001061 | Grad Max: 0.189323 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018051 | Grad Max: 1.062341 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.005326 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007930 | Grad Max: 0.058952 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000231 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001849 | Grad Max: 0.004988 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000118 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000489 | Grad Max: 0.001490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000375 | Grad Max: 0.001241 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007945 | Grad Max: 0.007945 [GRADIENT NORM TOTAL] 2.9779 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 1.021 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005431 0.4994569] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 718/1330 | B: 648/1400 | C: 596/1452 [LOSS Ex1] A: 0.64272 | B: 0.62418 | C: 0.61780 [LOGITS Ex2 A] Mean Abs: 2.139 | Max: 6.353 [LOSS Ex2] A: 0.10624 | B: 0.33564 | C: 0.23605 ** [JOINT LOSS] ** : 0.854212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002380 | Grad Max: 0.092961 -> Layer: shared_layers.0.bias | Grad Mean: 0.229213 | Grad Max: 0.940369 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005833 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001528 | Grad Max: 0.001528 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001545 | Grad Max: 0.243792 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028452 | Grad Max: 1.355964 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000191 | Grad Max: 0.007057 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014851 | Grad Max: 0.077911 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 
0.000301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002994 | Grad Max: 0.006866 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000789 | Grad Max: 0.001826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000501 | Grad Max: 0.001743 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012893 | Grad Max: 0.012893 [GRADIENT NORM TOTAL] 5.1248 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.717 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.712185 0.28781497] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.076 [MASKS] A(Pass/Fail): 713/1335 | B: 635/1413 | C: 565/1483 [LOSS Ex1] A: 0.63835 | B: 0.62479 | C: 0.61290 [LOGITS Ex2 A] Mean Abs: 2.105 | Max: 5.965 [LOSS Ex2] A: 0.13174 | B: 0.33848 | C: 0.22916 ** [JOINT LOSS] ** : 0.858472 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004069 | Grad Max: 0.138012 -> Layer: shared_layers.0.bias | Grad Mean: 0.121883 | Grad Max: 0.415941 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.006117 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002626 | Grad Max: 0.002626 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.351677 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016544 | Grad Max: 1.980028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003321 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005864 | Grad Max: 0.033558 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000243 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001342 | Grad Max: 0.004796 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000325 | Grad Max: 0.001141 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001228 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004873 | Grad Max: 0.004873 
[GRADIENT NORM TOTAL] 3.6294 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.828 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62821823 0.37178177] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 588/1028 | B: 638/1410 | C: 580/1468 [LOSS Ex1] A: 0.63657 | B: 0.62040 | C: 0.61793 [LOGITS Ex2 A] Mean Abs: 2.131 | Max: 7.070 [LOSS Ex2] A: 0.11672 | B: 0.31727 | C: 0.24565 ** [JOINT LOSS] ** : 0.851512 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003523 | Grad Max: 0.137655 -> Layer: shared_layers.0.bias | Grad Mean: 0.077980 | Grad Max: 0.322393 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006577 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010357 | Grad Max: 0.010357 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000911 | Grad Max: 0.111592 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015339 | Grad Max: 0.623305 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.005139 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007588 | Grad Max: 0.051329 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001785 | Grad Max: 0.004844 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000469 | Grad Max: 0.001226 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001456 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008098 | Grad Max: 0.008098 [GRADIENT NORM TOTAL] 2.3260 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.022 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50762016 0.49237987] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 717/1331 | B: 600/1256 | C: 
547/1501 [LOSS Ex1] A: 0.63721 | B: 0.62473 | C: 0.61767 [LOGITS Ex2 A] Mean Abs: 2.160 | Max: 8.386 [LOSS Ex2] A: 0.10701 | B: 0.31814 | C: 0.26023 ** [JOINT LOSS] ** : 0.855000 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004745 | Grad Max: 0.159380 -> Layer: shared_layers.0.bias | Grad Mean: 0.382879 | Grad Max: 1.849231 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005875 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001134 | Grad Max: 0.001134 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002574 | Grad Max: 0.415286 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047059 | Grad Max: 2.311259 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000301 | Grad Max: 0.010322 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023582 | Grad Max: 0.120438 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000367 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004915 | Grad Max: 0.010002 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000231 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001284 | Grad Max: 0.003205 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000788 | Grad Max: 0.002297 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021029 | Grad Max: 0.021029 [GRADIENT NORM TOTAL] 8.6286 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.964 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509228 0.49077204] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 712/1336 | B: 648/1400 | C: 542/1506 [LOSS Ex1] A: 0.63402 | B: 0.62403 | C: 0.62211 [LOGITS Ex2 A] Mean Abs: 2.166 | Max: 6.146 [LOSS Ex2] A: 0.11928 | B: 0.33326 | C: 0.24339 ** [JOINT LOSS] ** : 0.858697 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005569 | Grad Max: 0.199196 -> Layer: shared_layers.0.bias | Grad Mean: 0.316453 | Grad Max: 1.486901 -> Layer: exit1_layers.0.weight | Grad 
Mean: 0.002291 | Grad Max: 0.006211 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008093 | Grad Max: 0.008093 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002446 | Grad Max: 0.338221 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044152 | Grad Max: 1.902192 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.009507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023736 | Grad Max: 0.111132 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000420 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005008 | Grad Max: 0.010381 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000259 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001283 | Grad Max: 0.003135 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000843 | Grad Max: 0.002079 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020909 | Grad Max: 0.020909 [GRADIENT NORM TOTAL] 7.2901 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.995 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50606596 0.49393407] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 635/1413 | C: 556/1492 [LOSS Ex1] A: 0.63300 | B: 0.62463 | C: 0.61865 [LOGITS Ex2 A] Mean Abs: 2.096 | Max: 7.971 [LOSS Ex2] A: 0.13750 | B: 0.34829 | C: 0.24926 ** [JOINT LOSS] ** : 0.870442 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004846 | Grad Max: 0.188462 -> Layer: shared_layers.0.bias | Grad Mean: 0.324436 | Grad Max: 1.778595 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006520 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001609 | Grad Max: 0.001609 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.523810 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035363 | Grad Max: 2.870143 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.009129 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015643 
| Grad Max: 0.106349 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002745 | Grad Max: 0.006134 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000145 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000716 | Grad Max: 0.001869 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000481 | Grad Max: 0.001299 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012889 | Grad Max: 0.012889 [GRADIENT NORM TOTAL] 7.1032 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.822 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5016118 0.4983882] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.075 [MASKS] A(Pass/Fail): 685/1363 | B: 638/1410 | C: 548/1500 [LOSS Ex1] A: 0.64159 | B: 0.62024 | C: 0.62264 [LOGITS Ex2 A] Mean Abs: 2.073 | Max: 6.219 [LOSS Ex2] A: 0.10899 | B: 0.32874 | C: 0.23830 ** [JOINT LOSS] ** : 0.853502 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003507 | Grad Max: 0.121650 -> Layer: shared_layers.0.bias | Grad Mean: 0.369036 | Grad Max: 1.650256 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005408 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003240 | Grad Max: 0.003240 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.275986 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041024 | Grad Max: 1.556629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.012191 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021248 | Grad Max: 0.137525 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000375 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004058 | Grad Max: 0.009494 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001021 | Grad Max: 0.002787 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000668 | Grad Max: 
0.001964 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016445 | Grad Max: 0.016445 [GRADIENT NORM TOTAL] 7.4089 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.740 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54393584 0.45606413] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.072 [MASKS] A(Pass/Fail): 683/1365 | B: 600/1256 | C: 368/1008 [LOSS Ex1] A: 0.64187 | B: 0.62457 | C: 0.62154 [LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.176 [LOSS Ex2] A: 0.11597 | B: 0.31625 | C: 0.23541 ** [JOINT LOSS] ** : 0.851871 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.071276 -> Layer: shared_layers.0.bias | Grad Mean: 0.107327 | Grad Max: 0.495634 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.006083 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002677 | Grad Max: 0.002677 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000816 | Grad Max: 0.208021 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014138 | Grad Max: 1.137867 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003971 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002475 | Grad Max: 0.024599 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000121 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000381 | Grad Max: 0.002297 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000064 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000094 | Grad Max: 0.000610 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001138 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000119 | Grad Max: 0.000119 [GRADIENT NORM TOTAL] 3.0116 [EPOCH SUMMARY] Train Loss: 0.8586 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8412 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8421 -> New: 0.8412) ############################## EPOCH 137/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.925 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.77480894 0.22519103] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 745/1303 | B: 649/1399 | C: 533/1515 [LOSS Ex1] A: 0.63567 | B: 0.62387 | C: 0.62554 [LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.392 [LOSS Ex2] A: 0.11865 | B: 0.34101 | C: 0.24559 ** [JOINT LOSS] ** : 0.863443 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007277 | Grad Max: 0.186925 -> Layer: shared_layers.0.bias | Grad Mean: 0.472116 | Grad Max: 2.158170 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005586 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002355 | Grad Max: 0.002355 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003101 | Grad Max: 0.483135 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057197 | Grad Max: 2.692997 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000400 | Grad Max: 0.015054 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030643 | Grad Max: 0.175668 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000570 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006295 | Grad Max: 0.014117 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000324 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001589 | Grad Max: 0.004309 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000999 | Grad Max: 0.002308 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024726 | Grad Max: 0.024726 [GRADIENT NORM TOTAL] 10.3127 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 1.026 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005094 0.49949065] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 718/1330 | B: 636/1412 | C: 546/1502 [LOSS Ex1] A: 0.64251 | B: 0.62447 | C: 0.61910 [LOGITS Ex2 A] Mean Abs: 2.177 | Max: 5.784 [LOSS Ex2] A: 0.10518 | B: 0.34171 | C: 0.24577 ** [JOINT LOSS] ** : 0.859579 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003414 | Grad Max: 0.107920 -> Layer: shared_layers.0.bias | Grad Mean: 0.289460 | Grad Max: 1.505160 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005647 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002844 | Grad Max: 0.002844 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.389044 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038648 | Grad Max: 2.193226 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.011278 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018139 | Grad Max: 0.111627 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000303 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003482 | Grad Max: 0.008363 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000859 | Grad Max: 0.002354 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001627 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012375 | Grad Max: 0.012375 [GRADIENT NORM TOTAL] 7.1690 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.721 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7130187 0.2869813] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.076 [MASKS] A(Pass/Fail): 714/1334 | B: 638/1410 | C: 553/1495 [LOSS Ex1] A: 0.63812 | B: 0.62008 | C: 0.61820 [LOGITS Ex2 A] Mean Abs: 2.110 | Max: 6.003 [LOSS Ex2] A: 0.13333 | B: 0.32058 | C: 0.21918 ** [JOINT LOSS] ** : 0.849826 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005918 | Grad Max: 0.181300 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.289124 | Grad Max: 1.674311 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.005769 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003614 | Grad Max: 0.003614 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001925 | Grad Max: 0.389537 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034513 | Grad Max: 2.194812 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000236 | Grad Max: 0.007972 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018073 | Grad Max: 0.091532 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000365 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003824 | Grad Max: 0.008015 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000954 | Grad Max: 0.002736 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000665 | Grad Max: 0.002102 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014809 | Grad Max: 0.014809 [GRADIENT NORM TOTAL] 6.2368 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.832 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6286566 0.3713434] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.076 [MASKS] A(Pass/Fail): 589/1027 | B: 600/1256 | C: 580/1468 [LOSS Ex1] A: 0.63633 | B: 0.62441 | C: 0.61361 [LOGITS Ex2 A] Mean Abs: 2.173 | Max: 7.511 [LOSS Ex2] A: 0.11632 | B: 0.32081 | C: 0.23798 ** [JOINT LOSS] ** : 0.849824 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006669 | Grad Max: 0.206724 -> Layer: shared_layers.0.bias | Grad Mean: 0.429852 | Grad Max: 2.039979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.006222 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005552 | Grad Max: 0.005552 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002661 | Grad Max: 0.601511 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048290 | Grad Max: 3.350161 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000315 
| Grad Max: 0.012347 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024467 | Grad Max: 0.134525 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000448 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005166 | Grad Max: 0.010644 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000241 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001306 | Grad Max: 0.003127 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000893 | Grad Max: 0.002086 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021478 | Grad Max: 0.021478 [GRADIENT NORM TOTAL] 9.5909 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.028 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076454 0.4923546] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 717/1331 | B: 649/1399 | C: 610/1438 [LOSS Ex1] A: 0.63700 | B: 0.62372 | C: 0.61312 [LOGITS Ex2 A] Mean Abs: 2.175 | Max: 8.167 [LOSS Ex2] A: 0.10954 | B: 0.33538 | C: 0.24061 ** [JOINT LOSS] ** : 0.853118 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003081 | Grad Max: 0.126180 -> Layer: shared_layers.0.bias | Grad Mean: 0.149661 | Grad Max: 0.741012 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006771 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005272 | Grad Max: 0.005272 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001273 | Grad Max: 0.363780 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022433 | Grad Max: 2.047498 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004430 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007573 | Grad Max: 0.054724 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000148 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001285 | Grad Max: 0.004133 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 
0.001185 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.001045 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004561 | Grad Max: 0.004561 [GRADIENT NORM TOTAL] 4.2451 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.970 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50918984 0.49081016] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 712/1336 | B: 636/1412 | C: 582/1466 [LOSS Ex1] A: 0.63380 | B: 0.62432 | C: 0.62042 [LOGITS Ex2 A] Mean Abs: 2.193 | Max: 5.908 [LOSS Ex2] A: 0.11244 | B: 0.33998 | C: 0.23494 ** [JOINT LOSS] ** : 0.855297 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003572 | Grad Max: 0.090099 -> Layer: shared_layers.0.bias | Grad Mean: 0.274720 | Grad Max: 1.140549 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001644 | Grad Max: 0.001644 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.312162 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034976 | Grad Max: 1.741137 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.007604 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017224 | Grad Max: 0.091453 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000304 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003428 | Grad Max: 0.008214 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000836 | Grad Max: 0.002320 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000491 | Grad Max: 0.001412 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012362 | Grad Max: 0.012362 [GRADIENT NORM TOTAL] 6.2191 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.000 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5061461 0.49385396] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 638/1410 | C: 560/1488 [LOSS Ex1] A: 0.63279 | B: 0.61992 | C: 0.61685 [LOGITS Ex2 A] Mean Abs: 2.115 | Max: 6.502 [LOSS Ex2] A: 0.12662 | B: 0.32286 | C: 0.23367 ** [JOINT LOSS] ** : 0.850900 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003398 | Grad Max: 0.076180 -> Layer: shared_layers.0.bias | Grad Mean: 0.195642 | Grad Max: 0.991908 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002333 | Grad Max: 0.006174 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004663 | Grad Max: 0.004663 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001290 | Grad Max: 0.587768 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022611 | Grad Max: 3.269629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.005201 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007312 | Grad Max: 0.060499 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001386 | Grad Max: 0.004289 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001197 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.001450 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007640 | Grad Max: 0.007640 [GRADIENT NORM TOTAL] 5.5504 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.826 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50159836 0.49840164] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.075 [MASKS] A(Pass/Fail): 685/1363 | B: 600/1256 | C: 556/1492 [LOSS Ex1] A: 0.64139 | B: 0.62425 | C: 0.61526 [LOGITS Ex2 A] Mean Abs: 2.102 | Max: 5.867 [LOSS Ex2] A: 0.11309 | B: 0.33024 | C: 0.21693 ** [JOINT LOSS] ** : 0.847055 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.003083 | Grad Max: 0.106989 -> Layer: shared_layers.0.bias | Grad Mean: 0.324086 | Grad Max: 1.342922 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005793 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006150 | Grad Max: 0.006150 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.327135 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042066 | Grad Max: 1.839834 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000280 | Grad Max: 0.012671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022063 | Grad Max: 0.145535 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000342 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004364 | Grad Max: 0.009722 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001095 | Grad Max: 0.003269 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000705 | Grad Max: 0.002122 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017673 | Grad Max: 0.017673 [GRADIENT NORM TOTAL] 7.3099 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.125 | Max: 0.743 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439039 0.45609614] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 683/1365 | B: 649/1399 | C: 546/1502 [LOSS Ex1] A: 0.64168 | B: 0.62356 | C: 0.62269 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 5.640 [LOSS Ex2] A: 0.11966 | B: 0.33958 | C: 0.27049 ** [JOINT LOSS] ** : 0.872554 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004560 | Grad Max: 0.131719 -> Layer: shared_layers.0.bias | Grad Mean: 0.169226 | Grad Max: 0.784340 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005773 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009210 | Grad Max: 0.009210 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001113 | Grad Max: 0.276424 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.019170 | Grad Max: 1.544752 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.003783 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007496 | Grad Max: 0.043029 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000236 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001626 | Grad Max: 0.005071 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000375 | Grad Max: 0.001243 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000221 | Grad Max: 0.000863 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004137 | Grad Max: 0.004137 [GRADIENT NORM TOTAL] 4.0438 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.929 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7758441 0.2241559] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.077 [MASKS] A(Pass/Fail): 745/1303 | B: 636/1412 | C: 551/1497 [LOSS Ex1] A: 0.63546 | B: 0.62416 | C: 0.62086 [LOGITS Ex2 A] Mean Abs: 2.166 | Max: 6.460 [LOSS Ex2] A: 0.12039 | B: 0.33985 | C: 0.21902 ** [JOINT LOSS] ** : 0.853247 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004766 | Grad Max: 0.159686 -> Layer: shared_layers.0.bias | Grad Mean: 0.282445 | Grad Max: 1.420996 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.006288 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006652 | Grad Max: 0.006652 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001892 | Grad Max: 0.406144 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033760 | Grad Max: 2.272421 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000213 | Grad Max: 0.006708 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016445 | Grad Max: 0.078791 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000325 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003531 | Grad Max: 0.008226 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad 
Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000894 | Grad Max: 0.002310 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000559 | Grad Max: 0.002040 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014141 | Grad Max: 0.014141 [GRADIENT NORM TOTAL] 6.5322 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 1.031 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004766 0.49952343] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.075 [MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 552/1496 [LOSS Ex1] A: 0.64231 | B: 0.61977 | C: 0.62185 [LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.117 [LOSS Ex2] A: 0.10373 | B: 0.31953 | C: 0.25016 ** [JOINT LOSS] ** : 0.852449 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003519 | Grad Max: 0.090564 -> Layer: shared_layers.0.bias | Grad Mean: 0.205184 | Grad Max: 0.990398 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005414 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000734 | Grad Max: 0.000734 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001371 | Grad Max: 0.384083 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024709 | Grad Max: 2.164959 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.006727 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011906 | Grad Max: 0.076674 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000273 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002429 | Grad Max: 0.005865 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000124 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000605 | Grad Max: 0.001722 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000476 | Grad Max: 0.001614 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010573 | Grad Max: 0.010573 [GRADIENT NORM TOTAL] 4.8816 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 
1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.725 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71366674 0.2863333 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 714/1334 | B: 600/1256 | C: 571/1477 [LOSS Ex1] A: 0.63791 | B: 0.62410 | C: 0.61275 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.887 [LOSS Ex2] A: 0.12326 | B: 0.31852 | C: 0.22229 ** [JOINT LOSS] ** : 0.846273 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003654 | Grad Max: 0.106350 -> Layer: shared_layers.0.bias | Grad Mean: 0.172157 | Grad Max: 0.875789 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.006185 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000322 | Grad Max: 0.000322 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001150 | Grad Max: 0.149197 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020491 | Grad Max: 0.831694 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004877 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009228 | Grad Max: 0.055346 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000230 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001900 | Grad Max: 0.005353 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000462 | Grad Max: 0.001564 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001532 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007420 | Grad Max: 0.007420 [GRADIENT NORM TOTAL] 3.5181 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.837 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62893605 0.371064 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 589/1027 | B: 649/1399 | C: 550/1498 [LOSS Ex1] A: 0.63611 | B: 0.62341 | C: 0.61919 [LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.337 [LOSS Ex2] A: 0.11522 | B: 0.33537 | C: 0.22686 ** 
[JOINT LOSS] ** : 0.852055 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003157 | Grad Max: 0.138840 -> Layer: shared_layers.0.bias | Grad Mean: 0.368306 | Grad Max: 1.769218 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.005873 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002076 | Grad Max: 0.002076 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002426 | Grad Max: 0.338023 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044898 | Grad Max: 1.899609 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.011757 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024577 | Grad Max: 0.140828 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000449 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004888 | Grad Max: 0.010275 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000200 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001231 | Grad Max: 0.003080 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000717 | Grad Max: 0.002023 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019295 | Grad Max: 0.019295 [GRADIENT NORM TOTAL] 8.0605 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.032 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076932 0.4923068] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 717/1331 | B: 636/1412 | C: 377/999 [LOSS Ex1] A: 0.63678 | B: 0.62401 | C: 0.61932 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 8.456 [LOSS Ex2] A: 0.11654 | B: 0.34006 | C: 0.26308 ** [JOINT LOSS] ** : 0.866596 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003732 | Grad Max: 0.113706 -> Layer: shared_layers.0.bias | Grad Mean: 0.364956 | Grad Max: 1.536005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005452 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005239 | Grad Max: 0.005239 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.002519 | Grad Max: 0.399716 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045968 | Grad Max: 2.228642 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.011636 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022927 | Grad Max: 0.125768 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000401 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004504 | Grad Max: 0.010382 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001097 | Grad Max: 0.002856 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000611 | Grad Max: 0.001648 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016398 | Grad Max: 0.016398 [GRADIENT NORM TOTAL] 8.4874 [EPOCH SUMMARY] Train Loss: 0.8552 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8365 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8412 -> New: 0.8365) ############################## EPOCH 138/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.974 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50907546 0.49092454] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 712/1336 | B: 639/1409 | C: 580/1468 [LOSS Ex1] A: 0.63358 | B: 0.61961 | C: 0.61211 [LOGITS Ex2 A] Mean Abs: 2.156 | Max: 6.085 [LOSS Ex2] A: 0.11357 | B: 0.31548 | C: 0.20640 ** [JOINT LOSS] ** : 0.833586 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002333 | Grad Max: 0.065052 -> Layer: shared_layers.0.bias | Grad Mean: 0.169276 | Grad Max: 0.929979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002401 | Grad Max: 0.006403 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006325 | Grad Max: 0.006325 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001164 | Grad Max: 0.451354 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020870 | Grad 
Max: 2.514894 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004267 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008107 | Grad Max: 0.047648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001734 | Grad Max: 0.004782 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000422 | Grad Max: 0.001635 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000442 | Grad Max: 0.001593 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006553 | Grad Max: 0.006553 [GRADIENT NORM TOTAL] 4.7865 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.004 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062797 0.49372026] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 717/1331 | B: 600/1256 | C: 575/1473 [LOSS Ex1] A: 0.63257 | B: 0.62394 | C: 0.61558 [LOGITS Ex2 A] Mean Abs: 2.109 | Max: 6.381 [LOSS Ex2] A: 0.13301 | B: 0.32750 | C: 0.24628 ** [JOINT LOSS] ** : 0.859628 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003310 | Grad Max: 0.125029 -> Layer: shared_layers.0.bias | Grad Mean: 0.319880 | Grad Max: 1.637002 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006619 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002541 | Grad Max: 0.002541 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.436214 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038368 | Grad Max: 2.435384 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000244 | Grad Max: 0.009399 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019267 | Grad Max: 0.107404 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000380 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003819 | Grad Max: 0.008842 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000171 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000992 | Grad Max: 0.002441 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000689 | Grad Max: 0.001666 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017753 | Grad Max: 0.017753 [GRADIENT NORM TOTAL] 7.4321 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.830 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5015971 0.49840286] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.075 [MASKS] A(Pass/Fail): 685/1363 | B: 649/1399 | C: 549/1499 [LOSS Ex1] A: 0.64118 | B: 0.62326 | C: 0.61948 [LOGITS Ex2 A] Mean Abs: 2.103 | Max: 5.598 [LOSS Ex2] A: 0.11039 | B: 0.33704 | C: 0.22378 ** [JOINT LOSS] ** : 0.851712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001818 | Grad Max: 0.060862 -> Layer: shared_layers.0.bias | Grad Mean: 0.104461 | Grad Max: 0.540215 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005497 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000208 | Grad Max: 0.000208 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000772 | Grad Max: 0.302267 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013647 | Grad Max: 1.692483 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002608 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002614 | Grad Max: 0.021020 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000447 | Grad Max: 0.002883 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000106 | Grad Max: 0.000617 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000419 | Grad Max: 0.001175 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001508 | Grad Max: 0.001508 [GRADIENT NORM TOTAL] 3.3307 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 
A] Mean Abs: 0.125 | Max: 0.747 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438777 0.45612225] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 684/1364 | B: 636/1412 | C: 585/1463 [LOSS Ex1] A: 0.64148 | B: 0.62386 | C: 0.61851 [LOGITS Ex2 A] Mean Abs: 2.121 | Max: 5.697 [LOSS Ex2] A: 0.12183 | B: 0.33495 | C: 0.24669 ** [JOINT LOSS] ** : 0.862439 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004202 | Grad Max: 0.177966 -> Layer: shared_layers.0.bias | Grad Mean: 0.450845 | Grad Max: 2.282545 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005954 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009722 | Grad Max: 0.009722 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002809 | Grad Max: 0.460598 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051993 | Grad Max: 2.577108 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.010950 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024609 | Grad Max: 0.137967 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000422 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004934 | Grad Max: 0.010851 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000218 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001244 | Grad Max: 0.003183 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000776 | Grad Max: 0.001945 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020295 | Grad Max: 0.020295 [GRADIENT NORM TOTAL] 9.9252 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.934 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7768577 0.22314233] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.077 [MASKS] A(Pass/Fail): 745/1303 | B: 639/1409 | C: 584/1464 [LOSS Ex1] A: 0.63526 | B: 0.61946 | C: 0.61706 [LOGITS Ex2 A] Mean Abs: 2.155 | Max: 5.909 [LOSS Ex2] A: 0.11868 | B: 0.32065 | C: 0.24907 ** [JOINT LOSS] ** : 
0.853395 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004580 | Grad Max: 0.136789 -> Layer: shared_layers.0.bias | Grad Mean: 0.322345 | Grad Max: 1.827062 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002276 | Grad Max: 0.005691 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003593 | Grad Max: 0.003593 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.370891 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036335 | Grad Max: 2.079409 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000207 | Grad Max: 0.008967 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016119 | Grad Max: 0.107147 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000331 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003274 | Grad Max: 0.007984 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000174 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000782 | Grad Max: 0.002302 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001351 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010754 | Grad Max: 0.010754 [GRADIENT NORM TOTAL] 7.2014 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 1.035 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004848 0.49951515] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.076 [MASKS] A(Pass/Fail): 718/1330 | B: 600/1256 | C: 596/1452 [LOSS Ex1] A: 0.64212 | B: 0.62379 | C: 0.61787 [LOGITS Ex2 A] Mean Abs: 2.144 | Max: 5.939 [LOSS Ex2] A: 0.11107 | B: 0.32434 | C: 0.24089 ** [JOINT LOSS] ** : 0.853363 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004986 | Grad Max: 0.135589 -> Layer: shared_layers.0.bias | Grad Mean: 0.293427 | Grad Max: 1.244066 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005614 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001264 | Grad Max: 0.001264 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001873 | Grad 
Max: 0.221248 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033826 | Grad Max: 1.238881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000235 | Grad Max: 0.008076 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018427 | Grad Max: 0.095651 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003890 | Grad Max: 0.008407 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000201 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000988 | Grad Max: 0.002637 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000684 | Grad Max: 0.001895 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016631 | Grad Max: 0.016631 [GRADIENT NORM TOTAL] 5.8468 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.728 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71434695 0.28565305] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 522/1526 [LOSS Ex1] A: 0.63771 | B: 0.62312 | C: 0.62267 [LOGITS Ex2 A] Mean Abs: 2.132 | Max: 6.479 [LOSS Ex2] A: 0.12829 | B: 0.34528 | C: 0.23903 ** [JOINT LOSS] ** : 0.865365 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003760 | Grad Max: 0.089681 -> Layer: shared_layers.0.bias | Grad Mean: 0.252089 | Grad Max: 1.163138 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005945 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005151 | Grad Max: 0.005151 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001623 | Grad Max: 0.204726 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029346 | Grad Max: 1.141874 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000178 | Grad Max: 0.007717 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013728 | Grad Max: 0.073369 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000268 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002815 | Grad Max: 0.006843 
-> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000709 | Grad Max: 0.001973 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000485 | Grad Max: 0.001604 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011809 | Grad Max: 0.011809 [GRADIENT NORM TOTAL] 5.4482 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.841 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6292169 0.3707831] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 589/1027 | B: 636/1412 | C: 547/1501 [LOSS Ex1] A: 0.63592 | B: 0.62371 | C: 0.61941 [LOGITS Ex2 A] Mean Abs: 2.222 | Max: 7.044 [LOSS Ex2] A: 0.11281 | B: 0.33903 | C: 0.24247 ** [JOINT LOSS] ** : 0.857786 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004432 | Grad Max: 0.157784 -> Layer: shared_layers.0.bias | Grad Mean: 0.377136 | Grad Max: 1.919337 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.006127 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004616 | Grad Max: 0.004616 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002523 | Grad Max: 0.435848 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046947 | Grad Max: 2.430699 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000296 | Grad Max: 0.011801 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023253 | Grad Max: 0.146079 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000385 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004587 | Grad Max: 0.009907 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001144 | Grad Max: 0.003213 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000647 | Grad Max: 0.001903 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017459 | Grad Max: 0.017459 [GRADIENT NORM TOTAL] 8.5438 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.037 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50763553 0.49236444] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 539/1509 [LOSS Ex1] A: 0.63658 | B: 0.61932 | C: 0.61736 [LOGITS Ex2 A] Mean Abs: 2.199 | Max: 8.061 [LOSS Ex2] A: 0.11638 | B: 0.31211 | C: 0.22783 ** [JOINT LOSS] ** : 0.843194 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004325 | Grad Max: 0.182558 -> Layer: shared_layers.0.bias | Grad Mean: 0.437227 | Grad Max: 2.230555 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.005610 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000717 | Grad Max: 0.000717 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002764 | Grad Max: 0.488119 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051504 | Grad Max: 2.716687 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.010673 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026068 | Grad Max: 0.146794 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000401 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005239 | Grad Max: 0.010774 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001353 | Grad Max: 0.003306 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000837 | Grad Max: 0.002336 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022814 | Grad Max: 0.022814 [GRADIENT NORM TOTAL] 9.6704 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.979 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50900936 0.49099064] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 712/1336 | B: 600/1256 | C: 570/1478 [LOSS Ex1] A: 0.63338 | B: 0.62364 | C: 0.62072 [LOGITS Ex2 A] Mean Abs: 
2.160 | Max: 6.276 [LOSS Ex2] A: 0.11742 | B: 0.31405 | C: 0.24510 ** [JOINT LOSS] ** : 0.851435 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003642 | Grad Max: 0.123614 -> Layer: shared_layers.0.bias | Grad Mean: 0.203501 | Grad Max: 0.987507 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002140 | Grad Max: 0.006332 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001091 | Grad Max: 0.001091 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001319 | Grad Max: 0.517117 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023298 | Grad Max: 2.898510 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000124 | Grad Max: 0.004717 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009680 | Grad Max: 0.053948 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000225 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001856 | Grad Max: 0.005071 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000454 | Grad Max: 0.001490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000332 | Grad Max: 0.001169 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007313 | Grad Max: 0.007313 [GRADIENT NORM TOTAL] 5.6417 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.009 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50635856 0.49364147] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 717/1331 | B: 650/1398 | C: 536/1512 [LOSS Ex1] A: 0.63237 | B: 0.62298 | C: 0.62280 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.619 [LOSS Ex2] A: 0.12195 | B: 0.33814 | C: 0.23896 ** [JOINT LOSS] ** : 0.859062 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003451 | Grad Max: 0.107396 -> Layer: shared_layers.0.bias | Grad Mean: 0.266407 | Grad Max: 1.339336 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006378 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001819 | Grad Max: 0.001819 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001794 | Grad Max: 0.214231 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031957 | Grad Max: 1.175049 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.008649 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014041 | Grad Max: 0.093654 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000273 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002589 | Grad Max: 0.006702 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000153 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000654 | Grad Max: 0.002001 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001376 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010658 | Grad Max: 0.010658 [GRADIENT NORM TOTAL] 5.8573 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.833 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501528 0.49847195] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.557 | Std: 0.076 [MASKS] A(Pass/Fail): 685/1363 | B: 636/1412 | C: 557/1491 [LOSS Ex1] A: 0.64099 | B: 0.62357 | C: 0.61346 [LOGITS Ex2 A] Mean Abs: 2.133 | Max: 5.779 [LOSS Ex2] A: 0.11071 | B: 0.33622 | C: 0.24705 ** [JOINT LOSS] ** : 0.857334 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003420 | Grad Max: 0.091146 -> Layer: shared_layers.0.bias | Grad Mean: 0.153197 | Grad Max: 0.732175 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.006231 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009761 | Grad Max: 0.009761 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001027 | Grad Max: 0.456042 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018263 | Grad Max: 2.540079 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.004498 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005767 | Grad Max: 0.046473 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | 
Grad Max: 0.000189 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001102 | Grad Max: 0.004034 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000078 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000256 | Grad Max: 0.000962 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000343 | Grad Max: 0.000942 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003066 | Grad Max: 0.003066 [GRADIENT NORM TOTAL] 4.3899 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.750 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54389685 0.45610312] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.556 | Std: 0.073 [MASKS] A(Pass/Fail): 685/1363 | B: 639/1409 | C: 578/1470 [LOSS Ex1] A: 0.64129 | B: 0.61917 | C: 0.61488 [LOGITS Ex2 A] Mean Abs: 2.125 | Max: 5.793 [LOSS Ex2] A: 0.12070 | B: 0.30935 | C: 0.24238 ** [JOINT LOSS] ** : 0.849261 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002343 | Grad Max: 0.075720 -> Layer: shared_layers.0.bias | Grad Mean: 0.211473 | Grad Max: 1.004155 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.005899 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004036 | Grad Max: 0.004036 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.376713 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026425 | Grad Max: 2.105936 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.007041 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011207 | Grad Max: 0.079598 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000221 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002206 | Grad Max: 0.005437 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001479 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.001326 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008802 | Grad Max: 
0.008802 [GRADIENT NORM TOTAL] 5.2502 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.938 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.77792966 0.2220703 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.077 [MASKS] A(Pass/Fail): 746/1302 | B: 600/1256 | C: 360/1016 [LOSS Ex1] A: 0.63506 | B: 0.62349 | C: 0.61948 [LOGITS Ex2 A] Mean Abs: 2.129 | Max: 6.217 [LOSS Ex2] A: 0.10616 | B: 0.31738 | C: 0.21075 ** [JOINT LOSS] ** : 0.837442 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003705 | Grad Max: 0.093126 -> Layer: shared_layers.0.bias | Grad Mean: 0.218534 | Grad Max: 1.038931 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005860 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004240 | Grad Max: 0.004240 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001428 | Grad Max: 0.169764 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026284 | Grad Max: 0.948099 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000181 | Grad Max: 0.007595 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014114 | Grad Max: 0.070695 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000286 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002862 | Grad Max: 0.007114 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000183 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.002533 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.002107 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011973 | Grad Max: 0.011973 [GRADIENT NORM TOTAL] 4.4398 [EPOCH SUMMARY] Train Loss: 0.8525 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8344 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8365 -> New: 0.8344) ############################## EPOCH 139/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.040 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004846 0.49951538] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.076 [MASKS] A(Pass/Fail): 718/1330 | B: 650/1398 | C: 556/1492 [LOSS Ex1] A: 0.64193 | B: 0.62283 | C: 0.62045 [LOGITS Ex2 A] Mean Abs: 2.149 | Max: 6.082 [LOSS Ex2] A: 0.10492 | B: 0.34044 | C: 0.22908 ** [JOINT LOSS] ** : 0.853215 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004031 | Grad Max: 0.143488 -> Layer: shared_layers.0.bias | Grad Mean: 0.142023 | Grad Max: 1.129064 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005737 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000101 | Grad Max: 0.000101 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001000 | Grad Max: 0.217077 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016705 | Grad Max: 1.204916 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.003466 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003132 | Grad Max: 0.028526 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000142 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000479 | Grad Max: 0.002529 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000723 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000400 | Grad Max: 0.001018 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001894 | Grad Max: 0.001894 [GRADIENT NORM TOTAL] 3.7371 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.732 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7151232 0.28487676] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 714/1334 | B: 636/1412 | C: 533/1515 [LOSS Ex1] A: 0.63750 | B: 0.62342 | C: 0.62124 [LOGITS Ex2 A] Mean Abs: 2.146 | Max: 6.342 [LOSS Ex2] A: 0.12205 | B: 0.33360 | C: 0.23788 ** [JOINT LOSS] ** : 0.858561 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004245 | Grad Max: 0.125003 -> Layer: shared_layers.0.bias | Grad Mean: 0.134751 | Grad Max: 0.827644 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005745 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001920 | Grad Max: 0.001920 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001131 | Grad Max: 0.303935 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018971 | Grad Max: 1.700392 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.004403 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005038 | Grad Max: 0.042656 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000152 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000848 | Grad Max: 0.003805 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000200 | Grad Max: 0.000779 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000348 | Grad Max: 0.000875 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002517 | Grad Max: 0.002517 [GRADIENT NORM TOTAL] 3.8628 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.846 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6295932 0.37040678] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 589/1027 | B: 639/1409 | C: 584/1464 [LOSS Ex1] A: 0.63570 | B: 0.61902 | C: 0.61320 [LOGITS Ex2 A] Mean Abs: 2.185 | Max: 7.784 [LOSS Ex2] A: 0.11369 | B: 0.32192 | C: 0.22582 ** [JOINT LOSS] ** : 0.843115 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003821 | Grad Max: 0.120671 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.190586 | Grad Max: 0.852520 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002301 | Grad Max: 0.006623 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009978 | Grad Max: 0.009978 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001054 | Grad Max: 0.574919 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018910 | Grad Max: 3.199260 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000074 | Grad Max: 0.003911 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005162 | Grad Max: 0.043527 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000188 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001026 | Grad Max: 0.004116 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000253 | Grad Max: 0.000948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000412 | Grad Max: 0.001320 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004440 | Grad Max: 0.004440 [GRADIENT NORM TOTAL] 5.6119 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.042 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.507667 0.492333] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 718/1330 | B: 600/1256 | C: 542/1506 [LOSS Ex1] A: 0.63636 | B: 0.62333 | C: 0.61893 [LOGITS Ex2 A] Mean Abs: 2.173 | Max: 7.364 [LOSS Ex2] A: 0.10969 | B: 0.30902 | C: 0.25128 ** [JOINT LOSS] ** : 0.849536 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003925 | Grad Max: 0.138538 -> Layer: shared_layers.0.bias | Grad Mean: 0.182485 | Grad Max: 0.840974 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.005696 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004192 | Grad Max: 0.004192 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001137 | Grad Max: 0.440160 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018784 | Grad Max: 2.457703 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | 
Grad Max: 0.005234 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003942 | Grad Max: 0.045788 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000126 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000517 | Grad Max: 0.002372 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000059 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000141 | Grad Max: 0.000682 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000309 | Grad Max: 0.000943 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002480 | Grad Max: 0.002480 [GRADIENT NORM TOTAL] 5.0246 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.984 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50895447 0.4910455 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 713/1335 | B: 650/1398 | C: 548/1500 [LOSS Ex1] A: 0.63315 | B: 0.62267 | C: 0.61976 [LOGITS Ex2 A] Mean Abs: 2.197 | Max: 6.273 [LOSS Ex2] A: 0.11763 | B: 0.33621 | C: 0.24114 ** [JOINT LOSS] ** : 0.856853 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006166 | Grad Max: 0.234545 -> Layer: shared_layers.0.bias | Grad Mean: 0.111724 | Grad Max: 0.877695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.006201 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000730 | Grad Max: 0.000730 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001010 | Grad Max: 0.170496 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016354 | Grad Max: 0.939621 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.003527 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005185 | Grad Max: 0.032661 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000278 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001294 | Grad Max: 0.003964 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000129 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000311 | Grad Max: 
0.001100 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000294 | Grad Max: 0.001057 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004520 | Grad Max: 0.004520 [GRADIENT NORM TOTAL] 3.0691 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.014 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506485 0.49351504] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 717/1331 | B: 636/1412 | C: 566/1482 [LOSS Ex1] A: 0.63214 | B: 0.62325 | C: 0.62014 [LOGITS Ex2 A] Mean Abs: 2.157 | Max: 7.295 [LOSS Ex2] A: 0.12709 | B: 0.33031 | C: 0.23602 ** [JOINT LOSS] ** : 0.856311 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004235 | Grad Max: 0.134149 -> Layer: shared_layers.0.bias | Grad Mean: 0.125570 | Grad Max: 0.740954 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.006527 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000194 | Grad Max: 0.000194 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000974 | Grad Max: 0.215637 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016034 | Grad Max: 1.212048 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.002544 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002336 | Grad Max: 0.023016 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000189 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000329 | Grad Max: 0.002429 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000090 | Grad Max: 0.000567 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.000950 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000527 | Grad Max: 0.000527 [GRADIENT NORM TOTAL] 3.3813 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.838 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.5015316 0.4984684] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 687/1361 | B: 639/1409 | C: 590/1458 [LOSS Ex1] A: 0.64076 | B: 0.61884 | C: 0.61376 [LOGITS Ex2 A] Mean Abs: 2.142 | Max: 6.558 [LOSS Ex2] A: 0.11201 | B: 0.31296 | C: 0.22204 ** [JOINT LOSS] ** : 0.840123 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004197 | Grad Max: 0.206322 -> Layer: shared_layers.0.bias | Grad Mean: 0.084214 | Grad Max: 0.350190 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.005896 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007876 | Grad Max: 0.007876 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001047 | Grad Max: 0.146818 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017053 | Grad Max: 0.818219 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.003480 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006979 | Grad Max: 0.034961 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000224 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001664 | Grad Max: 0.004710 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000407 | Grad Max: 0.001406 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000473 | Grad Max: 0.001640 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006646 | Grad Max: 0.006646 [GRADIENT NORM TOTAL] 2.8015 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.126 | Max: 0.754 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54387975 0.45612025] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.073 [MASKS] A(Pass/Fail): 685/1363 | B: 600/1256 | C: 569/1479 [LOSS Ex1] A: 0.64107 | B: 0.62314 | C: 0.61406 [LOGITS Ex2 A] Mean Abs: 2.158 | Max: 5.935 [LOSS Ex2] A: 0.12351 | B: 0.31500 | C: 0.21692 ** [JOINT LOSS] ** : 0.844566 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.004034 | Grad Max: 0.124453 -> Layer: shared_layers.0.bias | Grad Mean: 0.214177 | Grad Max: 1.068504 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005810 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000308 | Grad Max: 0.000308 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001618 | Grad Max: 0.431269 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028355 | Grad Max: 2.415372 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000161 | Grad Max: 0.006791 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012317 | Grad Max: 0.085344 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000217 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002211 | Grad Max: 0.005690 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000116 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000583 | Grad Max: 0.001682 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000435 | Grad Max: 0.001398 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010063 | Grad Max: 0.010063 [GRADIENT NORM TOTAL] 5.5701 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.943 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.77918535 0.22081466] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.077 [MASKS] A(Pass/Fail): 746/1302 | B: 650/1398 | C: 602/1446 [LOSS Ex1] A: 0.63482 | B: 0.62248 | C: 0.61417 [LOGITS Ex2 A] Mean Abs: 2.190 | Max: 7.447 [LOSS Ex2] A: 0.10840 | B: 0.32315 | C: 0.22173 ** [JOINT LOSS] ** : 0.841582 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003602 | Grad Max: 0.117547 -> Layer: shared_layers.0.bias | Grad Mean: 0.112271 | Grad Max: 0.612127 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002258 | Grad Max: 0.006461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002176 | Grad Max: 0.002176 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000870 | Grad Max: 0.331717 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.014135 | Grad Max: 1.859245 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002867 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002248 | Grad Max: 0.027670 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000162 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000355 | Grad Max: 0.002524 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000056 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000089 | Grad Max: 0.000549 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000407 | Grad Max: 0.001021 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000728 | Grad Max: 0.000728 [GRADIENT NORM TOTAL] 3.3935 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.046 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004692 0.49953073] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.076 [MASKS] A(Pass/Fail): 718/1330 | B: 636/1412 | C: 562/1486 [LOSS Ex1] A: 0.64169 | B: 0.62305 | C: 0.62071 [LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.177 [LOSS Ex2] A: 0.10550 | B: 0.33012 | C: 0.23295 ** [JOINT LOSS] ** : 0.851343 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003489 | Grad Max: 0.126152 -> Layer: shared_layers.0.bias | Grad Mean: 0.098808 | Grad Max: 0.622250 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.005521 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000637 | Grad Max: 0.000637 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000750 | Grad Max: 0.171783 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012183 | Grad Max: 0.963763 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.003939 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003215 | Grad Max: 0.030988 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000701 | Grad Max: 0.003059 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | 
Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000171 | Grad Max: 0.000853 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001424 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004047 | Grad Max: 0.004047 [GRADIENT NORM TOTAL] 2.5696 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.737 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7159942 0.28400576] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 714/1334 | B: 639/1409 | C: 566/1482 [LOSS Ex1] A: 0.63724 | B: 0.61863 | C: 0.61384 [LOGITS Ex2 A] Mean Abs: 2.190 | Max: 6.254 [LOSS Ex2] A: 0.12624 | B: 0.31470 | C: 0.22872 ** [JOINT LOSS] ** : 0.846453 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002918 | Grad Max: 0.093407 -> Layer: shared_layers.0.bias | Grad Mean: 0.116116 | Grad Max: 0.549032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002232 | Grad Max: 0.005990 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000367 | Grad Max: 0.000367 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000956 | Grad Max: 0.438946 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016518 | Grad Max: 2.449485 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.006042 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005623 | Grad Max: 0.065622 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000157 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001244 | Grad Max: 0.004208 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000096 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000331 | Grad Max: 0.001037 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001077 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005041 | Grad Max: 0.005041 [GRADIENT NORM TOTAL] 4.0476 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.851 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.62994856 0.37005144] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 589/1027 | B: 600/1256 | C: 554/1494 [LOSS Ex1] A: 0.63543 | B: 0.62293 | C: 0.61789 [LOGITS Ex2 A] Mean Abs: 2.222 | Max: 8.235 [LOSS Ex2] A: 0.11773 | B: 0.31721 | C: 0.24296 ** [JOINT LOSS] ** : 0.851383 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003952 | Grad Max: 0.104058 -> Layer: shared_layers.0.bias | Grad Mean: 0.168998 | Grad Max: 0.767948 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002147 | Grad Max: 0.006194 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010402 | Grad Max: 0.010402 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001215 | Grad Max: 0.512284 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021062 | Grad Max: 2.844548 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.005184 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007813 | Grad Max: 0.057298 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001300 | Grad Max: 0.004061 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000327 | Grad Max: 0.001131 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000263 | Grad Max: 0.001056 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006088 | Grad Max: 0.006088 [GRADIENT NORM TOTAL] 4.7346 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.049 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50768834 0.49231163] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 717/1331 | B: 650/1398 | C: 593/1455 [LOSS Ex1] A: 0.63609 | B: 0.62226 | C: 0.60985 [LOGITS Ex2 A] Mean Abs: 2.209 | Max: 7.571 [LOSS Ex2] A: 0.11146 | B: 0.33009 | C: 
0.23878 ** [JOINT LOSS] ** : 0.849507 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005933 | Grad Max: 0.268127 -> Layer: shared_layers.0.bias | Grad Mean: 0.150234 | Grad Max: 1.285709 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006195 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002741 | Grad Max: 0.002741 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001262 | Grad Max: 0.162715 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020891 | Grad Max: 0.897498 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000116 | Grad Max: 0.005290 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007870 | Grad Max: 0.053138 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000279 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001851 | Grad Max: 0.005007 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000122 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000451 | Grad Max: 0.001471 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000322 | Grad Max: 0.001225 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006622 | Grad Max: 0.006622 [GRADIENT NORM TOTAL] 3.7747 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.990 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088788 0.49112117] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 713/1335 | B: 637/1411 | C: 324/1052 [LOSS Ex1] A: 0.63287 | B: 0.62284 | C: 0.62856 [LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.352 [LOSS Ex2] A: 0.11230 | B: 0.33043 | C: 0.23014 ** [JOINT LOSS] ** : 0.852378 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002883 | Grad Max: 0.094970 -> Layer: shared_layers.0.bias | Grad Mean: 0.071526 | Grad Max: 0.332434 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005989 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001177 | Grad Max: 0.001177 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.000813 | Grad Max: 0.149971 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013582 | Grad Max: 0.823411 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.003928 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003853 | Grad Max: 0.036735 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000138 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000579 | Grad Max: 0.002913 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000165 | Grad Max: 0.000808 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001150 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003642 | Grad Max: 0.003642 [GRADIENT NORM TOTAL] 2.3384 [EPOCH SUMMARY] Train Loss: 0.8496 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8312 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8344 -> New: 0.8312) ############################## EPOCH 140/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.021 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5065903 0.49340966] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.077 [MASKS] A(Pass/Fail): 717/1331 | B: 639/1409 | C: 549/1499 [LOSS Ex1] A: 0.63185 | B: 0.61841 | C: 0.62371 [LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.199 [LOSS Ex2] A: 0.12436 | B: 0.31335 | C: 0.24588 ** [JOINT LOSS] ** : 0.852522 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002557 | Grad Max: 0.081023 -> Layer: shared_layers.0.bias | Grad Mean: 0.136952 | Grad Max: 0.933232 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.005628 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001269 | Grad Max: 0.001269 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001033 | Grad Max: 0.319262 -> Layer: exit2_layers.0.bias 
| Grad Mean: 0.018289 | Grad Max: 1.789287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.003413 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004780 | Grad Max: 0.032133 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000180 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000909 | Grad Max: 0.003650 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000188 | Grad Max: 0.000934 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000236 | Grad Max: 0.000745 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001328 | Grad Max: 0.001328 [GRADIENT NORM TOTAL] 4.0768 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.844 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50149477 0.4985053 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 689/1359 | B: 600/1256 | C: 581/1467 [LOSS Ex1] A: 0.64049 | B: 0.62271 | C: 0.61743 [LOGITS Ex2 A] Mean Abs: 2.182 | Max: 7.962 [LOSS Ex2] A: 0.11912 | B: 0.31035 | C: 0.22272 ** [JOINT LOSS] ** : 0.844273 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007425 | Grad Max: 0.308850 -> Layer: shared_layers.0.bias | Grad Mean: 0.329574 | Grad Max: 1.441409 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.006352 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003586 | Grad Max: 0.003586 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.522059 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042899 | Grad Max: 2.934866 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.009618 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022261 | Grad Max: 0.116798 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000439 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004637 | Grad Max: 0.010082 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000016 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001131 | Grad Max: 0.003273 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000810 | Grad Max: 0.002433 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018449 | Grad Max: 0.018449 [GRADIENT NORM TOTAL] 7.7924 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.759 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54389066 0.4561094 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 685/1363 | B: 650/1398 | C: 555/1493 [LOSS Ex1] A: 0.64082 | B: 0.62205 | C: 0.61862 [LOGITS Ex2 A] Mean Abs: 2.217 | Max: 6.138 [LOSS Ex2] A: 0.12243 | B: 0.34028 | C: 0.21690 ** [JOINT LOSS] ** : 0.853698 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007355 | Grad Max: 0.340115 -> Layer: shared_layers.0.bias | Grad Mean: 0.191857 | Grad Max: 0.734088 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005848 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005498 | Grad Max: 0.005498 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001657 | Grad Max: 0.333595 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025889 | Grad Max: 1.831378 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.008504 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005879 | Grad Max: 0.091006 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000589 | Grad Max: 0.003144 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000057 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000159 | Grad Max: 0.000730 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000265 | Grad Max: 0.000739 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002610 | Grad Max: 0.002610 [GRADIENT NORM TOTAL] 4.9227 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.950 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.78066885 0.21933112] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 747/1301 | B: 638/1410 | C: 537/1511 [LOSS Ex1] A: 0.63455 | B: 0.62262 | C: 0.61726 [LOGITS Ex2 A] Mean Abs: 2.269 | Max: 6.548 [LOSS Ex2] A: 0.11223 | B: 0.34179 | C: 0.23740 ** [JOINT LOSS] ** : 0.855283 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004762 | Grad Max: 0.147816 -> Layer: shared_layers.0.bias | Grad Mean: 0.376326 | Grad Max: 1.958760 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.005849 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002718 | Grad Max: 0.002718 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002542 | Grad Max: 0.486189 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046394 | Grad Max: 2.661593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.012278 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022979 | Grad Max: 0.149557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000406 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004237 | Grad Max: 0.009814 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000206 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001016 | Grad Max: 0.002945 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000570 | Grad Max: 0.001508 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015067 | Grad Max: 0.015067 [GRADIENT NORM TOTAL] 8.6724 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 1.054 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50047594 0.49952406] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 719/1329 | B: 639/1409 | C: 590/1458 [LOSS Ex1] A: 0.64144 | B: 0.61819 | C: 0.61213 [LOGITS Ex2 A] Mean Abs: 2.254 | Max: 5.960 [LOSS Ex2] A: 0.10961 | B: 0.31058 | 
C: 0.23854 ** [JOINT LOSS] ** : 0.843498 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004657 | Grad Max: 0.162262 -> Layer: shared_layers.0.bias | Grad Mean: 0.163264 | Grad Max: 0.772581 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002151 | Grad Max: 0.006117 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001233 | Grad Max: 0.001233 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001124 | Grad Max: 0.529504 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018161 | Grad Max: 2.933545 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.002431 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002860 | Grad Max: 0.022263 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000186 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000570 | Grad Max: 0.002912 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000694 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.000933 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002006 | Grad Max: 0.002006 [GRADIENT NORM TOTAL] 5.3078 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.742 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7170845 0.2829155] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.078 [MASKS] A(Pass/Fail): 714/1334 | B: 602/1254 | C: 598/1450 [LOSS Ex1] A: 0.63697 | B: 0.62250 | C: 0.61476 [LOGITS Ex2 A] Mean Abs: 2.214 | Max: 6.455 [LOSS Ex2] A: 0.12937 | B: 0.32338 | C: 0.22349 ** [JOINT LOSS] ** : 0.850158 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003974 | Grad Max: 0.127598 -> Layer: shared_layers.0.bias | Grad Mean: 0.388217 | Grad Max: 1.835605 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006125 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001295 | Grad Max: 0.001295 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.002422 | Grad Max: 0.605514 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044039 | Grad Max: 3.387959 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000270 | Grad Max: 0.009023 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021654 | Grad Max: 0.110027 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004342 | Grad Max: 0.009834 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000232 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001064 | Grad Max: 0.003170 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000635 | Grad Max: 0.001762 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016656 | Grad Max: 0.016656 [GRADIENT NORM TOTAL] 8.6722 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.857 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6305006 0.36949936] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 589/1027 | B: 650/1398 | C: 558/1490 [LOSS Ex1] A: 0.63517 | B: 0.62185 | C: 0.61701 [LOGITS Ex2 A] Mean Abs: 2.276 | Max: 10.290 [LOSS Ex2] A: 0.11885 | B: 0.33699 | C: 0.24143 ** [JOINT LOSS] ** : 0.857102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004764 | Grad Max: 0.135540 -> Layer: shared_layers.0.bias | Grad Mean: 0.149681 | Grad Max: 0.765896 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005956 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002787 | Grad Max: 0.002787 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001101 | Grad Max: 0.151988 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018435 | Grad Max: 0.835920 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000064 | Grad Max: 0.004143 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003206 | Grad Max: 0.048397 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000152 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.000464 | Grad Max: 0.002437 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000002 | Grad Max: 0.000071 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.000671 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000180 | Grad Max: 0.000608 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003281 | Grad Max: 0.003281 [GRADIENT NORM TOTAL] 3.2094 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.055 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.507633 0.49236706] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 718/1330 | B: 638/1410 | C: 572/1476 [LOSS Ex1] A: 0.63583 | B: 0.62242 | C: 0.61337 [LOGITS Ex2 A] Mean Abs: 2.279 | Max: 7.833 [LOSS Ex2] A: 0.10507 | B: 0.33251 | C: 0.21231 ** [JOINT LOSS] ** : 0.840503 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007637 | Grad Max: 0.247156 -> Layer: shared_layers.0.bias | Grad Mean: 0.413837 | Grad Max: 2.193250 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.005872 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001592 | Grad Max: 0.001592 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002823 | Grad Max: 0.437951 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051527 | Grad Max: 2.434974 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.011030 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025522 | Grad Max: 0.135162 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000457 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005277 | Grad Max: 0.011653 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000250 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001301 | Grad Max: 0.003416 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000830 | Grad Max: 0.002193 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020436 | Grad Max: 0.020436 [GRADIENT NORM TOTAL] 
9.2171 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.996 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508936 0.49106395] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 713/1335 | B: 639/1409 | C: 556/1492 [LOSS Ex1] A: 0.63260 | B: 0.61800 | C: 0.61405 [LOGITS Ex2 A] Mean Abs: 2.244 | Max: 6.921 [LOSS Ex2] A: 0.11842 | B: 0.30909 | C: 0.22818 ** [JOINT LOSS] ** : 0.840114 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006240 | Grad Max: 0.191194 -> Layer: shared_layers.0.bias | Grad Mean: 0.298517 | Grad Max: 1.778480 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002349 | Grad Max: 0.005851 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002061 | Grad Max: 0.002061 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.339120 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037026 | Grad Max: 1.894032 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.007368 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017139 | Grad Max: 0.100450 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003600 | Grad Max: 0.007821 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000188 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000892 | Grad Max: 0.002556 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000554 | Grad Max: 0.001615 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014095 | Grad Max: 0.014095 [GRADIENT NORM TOTAL] 6.9127 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.027 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50661 0.49339002] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 717/1331 | B: 602/1254 | C: 517/1531 [LOSS Ex1] A: 0.63159 | 
B: 0.62230 | C: 0.62133 [LOGITS Ex2 A] Mean Abs: 2.189 | Max: 7.298 [LOSS Ex2] A: 0.12330 | B: 0.33132 | C: 0.23693 ** [JOINT LOSS] ** : 0.855591 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004235 | Grad Max: 0.135675 -> Layer: shared_layers.0.bias | Grad Mean: 0.370391 | Grad Max: 2.068271 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.006791 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008732 | Grad Max: 0.008732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.503122 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044481 | Grad Max: 2.809725 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.010489 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024371 | Grad Max: 0.139562 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000381 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004543 | Grad Max: 0.009598 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000214 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001084 | Grad Max: 0.003023 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000667 | Grad Max: 0.001718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017072 | Grad Max: 0.017072 [GRADIENT NORM TOTAL] 8.3702 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.848 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50139356 0.49860647] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.077 [MASKS] A(Pass/Fail): 689/1359 | B: 650/1398 | C: 551/1497 [LOSS Ex1] A: 0.64026 | B: 0.62166 | C: 0.61863 [LOGITS Ex2 A] Mean Abs: 2.139 | Max: 6.503 [LOSS Ex2] A: 0.10567 | B: 0.34140 | C: 0.26369 ** [JOINT LOSS] ** : 0.863774 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007571 | Grad Max: 0.176825 -> Layer: shared_layers.0.bias | Grad Mean: 0.418401 | Grad Max: 2.074460 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 
0.006120 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009362 | Grad Max: 0.009362 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002934 | Grad Max: 0.624430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054114 | Grad Max: 3.519325 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.012522 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029494 | Grad Max: 0.174911 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000467 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006054 | Grad Max: 0.012255 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000247 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001557 | Grad Max: 0.003579 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001118 | Grad Max: 0.002526 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027746 | Grad Max: 0.027746 [GRADIENT NORM TOTAL] 9.4885 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.127 | Max: 0.764 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54400074 0.45599923] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 686/1362 | B: 638/1410 | C: 569/1479 [LOSS Ex1] A: 0.64060 | B: 0.62224 | C: 0.61598 [LOGITS Ex2 A] Mean Abs: 2.165 | Max: 6.582 [LOSS Ex2] A: 0.11798 | B: 0.33816 | C: 0.24011 ** [JOINT LOSS] ** : 0.858357 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005100 | Grad Max: 0.127134 -> Layer: shared_layers.0.bias | Grad Mean: 0.231408 | Grad Max: 1.278394 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006428 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011655 | Grad Max: 0.011655 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001470 | Grad Max: 0.435101 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025580 | Grad Max: 2.404567 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004405 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003630 | Grad Max: 0.037924 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000538 | Grad Max: 0.002817 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.000857 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000287 | Grad Max: 0.001151 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000052 | Grad Max: 0.000052 [GRADIENT NORM TOTAL] 5.6420 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.955 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.78205 0.21794997] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 749/1299 | B: 639/1409 | C: 584/1464 [LOSS Ex1] A: 0.63431 | B: 0.61783 | C: 0.61129 [LOGITS Ex2 A] Mean Abs: 2.225 | Max: 7.405 [LOSS Ex2] A: 0.11355 | B: 0.32480 | C: 0.25043 ** [JOINT LOSS] ** : 0.850733 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003857 | Grad Max: 0.255086 -> Layer: shared_layers.0.bias | Grad Mean: 0.643389 | Grad Max: 3.158346 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002364 | Grad Max: 0.006040 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006500 | Grad Max: 0.006500 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004084 | Grad Max: 0.787372 -> Layer: exit2_layers.0.bias | Grad Mean: 0.075941 | Grad Max: 4.392558 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000456 | Grad Max: 0.018411 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037674 | Grad Max: 0.235381 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000544 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007328 | Grad Max: 0.015448 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000311 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001800 | Grad Max: 0.004874 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000983 | Grad Max: 0.002244 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.028225 | Grad Max: 0.028225 [GRADIENT NORM TOTAL] 15.0116 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.059 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500508 0.499492] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 720/1328 | B: 602/1254 | C: 384/992 [LOSS Ex1] A: 0.64123 | B: 0.62214 | C: 0.61789 [LOGITS Ex2 A] Mean Abs: 2.220 | Max: 5.914 [LOSS Ex2] A: 0.10563 | B: 0.30831 | C: 0.25821 ** [JOINT LOSS] ** : 0.851134 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003774 | Grad Max: 0.107573 -> Layer: shared_layers.0.bias | Grad Mean: 0.358963 | Grad Max: 1.465695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005599 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001563 | Grad Max: 0.001563 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002515 | Grad Max: 0.417776 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046133 | Grad Max: 2.321572 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000286 | Grad Max: 0.010567 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023099 | Grad Max: 0.128929 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004304 | Grad Max: 0.009352 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000173 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001032 | Grad Max: 0.002567 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000576 | Grad Max: 0.001737 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015235 | Grad Max: 0.015235 [GRADIENT NORM TOTAL] 8.4015 [EPOCH SUMMARY] Train Loss: 0.8512 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8325 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 141/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.746 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71806955 0.28193048] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.078 [MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 556/1492 [LOSS Ex1] A: 0.63675 | B: 0.62151 | C: 0.62048 [LOGITS Ex2 A] Mean Abs: 2.168 | Max: 7.404 [LOSS Ex2] A: 0.13296 | B: 0.33860 | C: 0.23933 ** [JOINT LOSS] ** : 0.863211 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006865 | Grad Max: 0.165085 -> Layer: shared_layers.0.bias | Grad Mean: 0.422659 | Grad Max: 2.043360 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.005883 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008484 | Grad Max: 0.008484 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002571 | Grad Max: 0.391884 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047330 | Grad Max: 2.057196 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000307 | Grad Max: 0.010217 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024227 | Grad Max: 0.136355 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000468 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004914 | Grad Max: 0.011162 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000231 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001246 | Grad Max: 0.003449 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000852 | Grad Max: 0.001976 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021473 | Grad Max: 0.021473 [GRADIENT NORM TOTAL] 8.5403 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.862 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6309649 0.36903512] | 
Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 589/1027 | B: 638/1410 | C: 534/1514 [LOSS Ex1] A: 0.63495 | B: 0.62210 | C: 0.62380 [LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.140 [LOSS Ex2] A: 0.10865 | B: 0.33398 | C: 0.23800 ** [JOINT LOSS] ** : 0.853824 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006106 | Grad Max: 0.210175 -> Layer: shared_layers.0.bias | Grad Mean: 0.543747 | Grad Max: 2.841476 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.006046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004787 | Grad Max: 0.004787 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003296 | Grad Max: 0.789017 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061478 | Grad Max: 4.366045 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000371 | Grad Max: 0.014452 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029746 | Grad Max: 0.192105 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000543 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005904 | Grad Max: 0.013508 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000269 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001491 | Grad Max: 0.003896 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001008 | Grad Max: 0.002476 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025858 | Grad Max: 0.025858 [GRADIENT NORM TOTAL] 12.6216 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.061 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50765634 0.4923436 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 579/1469 [LOSS Ex1] A: 0.63562 | B: 0.61769 | C: 0.61272 [LOGITS Ex2 A] Mean Abs: 2.225 | Max: 7.728 [LOSS Ex2] A: 0.10557 | B: 0.31006 | C: 0.22311 ** [JOINT LOSS] ** : 0.834920 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002296 | Grad 
Max: 0.058193 -> Layer: shared_layers.0.bias | Grad Mean: 0.145778 | Grad Max: 0.649364 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002278 | Grad Max: 0.005523 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003038 | Grad Max: 0.003038 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000879 | Grad Max: 0.531998 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015332 | Grad Max: 2.950930 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002626 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002311 | Grad Max: 0.018051 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000156 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000388 | Grad Max: 0.002882 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001182 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000577 | Grad Max: 0.000577 [GRADIENT NORM TOTAL] 5.0357 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.001 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5088294 0.4911706] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 713/1335 | B: 602/1254 | C: 566/1482 [LOSS Ex1] A: 0.63238 | B: 0.62200 | C: 0.61461 [LOGITS Ex2 A] Mean Abs: 2.242 | Max: 6.243 [LOSS Ex2] A: 0.11587 | B: 0.31362 | C: 0.23751 ** [JOINT LOSS] ** : 0.845329 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006509 | Grad Max: 0.175861 -> Layer: shared_layers.0.bias | Grad Mean: 0.429425 | Grad Max: 2.069848 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002250 | Grad Max: 0.006205 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001784 | Grad Max: 0.001784 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002889 | Grad Max: 0.474126 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052524 | Grad Max: 2.664815 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.014731 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027788 | Grad Max: 0.152766 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000419 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005681 | Grad Max: 0.011556 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000233 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001452 | Grad Max: 0.003692 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000902 | Grad Max: 0.002170 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024165 | Grad Max: 0.024165 [GRADIENT NORM TOTAL] 9.3701 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.032 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067053 0.49329472] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 717/1331 | B: 650/1398 | C: 558/1490 [LOSS Ex1] A: 0.63138 | B: 0.62138 | C: 0.61338 [LOGITS Ex2 A] Mean Abs: 2.216 | Max: 6.944 [LOSS Ex2] A: 0.12910 | B: 0.32976 | C: 0.23933 ** [JOINT LOSS] ** : 0.854778 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007923 | Grad Max: 0.246288 -> Layer: shared_layers.0.bias | Grad Mean: 0.360676 | Grad Max: 1.665068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002290 | Grad Max: 0.006130 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006900 | Grad Max: 0.006900 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.480935 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046556 | Grad Max: 2.687396 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000329 | Grad Max: 0.012096 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025674 | Grad Max: 0.126939 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000453 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005365 | Grad Max: 0.011309 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000270 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001309 | Grad Max: 0.003931 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000742 | Grad Max: 0.002091 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019436 | Grad Max: 0.019436 [GRADIENT NORM TOTAL] 8.0701 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.852 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50141203 0.49858797] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.077 [MASKS] A(Pass/Fail): 689/1359 | B: 638/1410 | C: 535/1513 [LOSS Ex1] A: 0.64006 | B: 0.62197 | C: 0.61956 [LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.747 [LOSS Ex2] A: 0.10880 | B: 0.33890 | C: 0.24637 ** [JOINT LOSS] ** : 0.858554 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003791 | Grad Max: 0.179492 -> Layer: shared_layers.0.bias | Grad Mean: 0.462530 | Grad Max: 2.464746 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002030 | Grad Max: 0.005616 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006878 | Grad Max: 0.006878 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003084 | Grad Max: 0.618228 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057700 | Grad Max: 3.477134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000362 | Grad Max: 0.014352 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029584 | Grad Max: 0.160496 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000478 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005813 | Grad Max: 0.012739 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001424 | Grad Max: 0.003831 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000897 | Grad Max: 0.002111 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023576 | Grad Max: 0.023576 [GRADIENT NORM TOTAL] 10.7159 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.127 | Max: 0.767 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438761 0.45612392] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 686/1362 | B: 639/1409 | C: 562/1486 [LOSS Ex1] A: 0.64041 | B: 0.61756 | C: 0.61368 [LOGITS Ex2 A] Mean Abs: 2.100 | Max: 6.623 [LOSS Ex2] A: 0.11936 | B: 0.31683 | C: 0.22766 ** [JOINT LOSS] ** : 0.845170 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004313 | Grad Max: 0.167274 -> Layer: shared_layers.0.bias | Grad Mean: 0.494722 | Grad Max: 2.357935 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.005347 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003307 | Grad Max: 0.003307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003209 | Grad Max: 0.590102 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059836 | Grad Max: 3.304147 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.014037 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031471 | Grad Max: 0.176374 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000574 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006190 | Grad Max: 0.015227 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001500 | Grad Max: 0.004151 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000850 | Grad Max: 0.002246 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023187 | Grad Max: 0.023187 [GRADIENT NORM TOTAL] 11.0444 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.960 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.782946 0.21705402] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.078 [MASKS] A(Pass/Fail): 747/1301 | B: 602/1254 | C: 568/1480 [LOSS Ex1] A: 0.63412 | B: 0.62187 | C: 0.61537 [LOGITS Ex2 A] Mean Abs: 2.169 | Max: 6.466 [LOSS Ex2] A: 0.10306 | B: 0.31667 | C: 0.23479 ** [JOINT LOSS] ** : 0.841963 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002459 | Grad Max: 0.089490 -> Layer: shared_layers.0.bias | Grad Mean: 0.100319 | Grad Max: 0.615362 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.006046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009890 | Grad Max: 0.009890 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000867 | Grad Max: 0.203104 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014962 | Grad Max: 1.123085 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003092 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002910 | Grad Max: 0.021571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000521 | Grad Max: 0.003843 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000082 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000121 | Grad Max: 0.000906 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000363 | Grad Max: 0.001164 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001485 | Grad Max: 0.001485 [GRADIENT NORM TOTAL] 3.0676 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 1.064 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004384 0.49956158] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 720/1328 | B: 650/1398 | C: 550/1498 [LOSS Ex1] A: 0.64105 | B: 0.62125 | C: 0.61981 [LOGITS Ex2 A] Mean Abs: 2.219 | Max: 5.949 [LOSS Ex2] A: 0.10965 | B: 0.32654 | C: 0.22761 ** [JOINT LOSS] ** : 0.848642 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004757 | Grad Max: 0.236828 -> Layer: shared_layers.0.bias | Grad Mean: 0.540930 | Grad Max: 2.712476 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005894 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004723 | Grad Max: 0.004723 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003303 | Grad Max: 
0.584947 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061773 | Grad Max: 3.252486 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.014760 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031442 | Grad Max: 0.177671 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000460 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006192 | Grad Max: 0.012402 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000252 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001570 | Grad Max: 0.004029 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000906 | Grad Max: 0.002369 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025597 | Grad Max: 0.025597 [GRADIENT NORM TOTAL] 11.9135 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.750 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.718693 0.28130698] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.078 [MASKS] A(Pass/Fail): 714/1334 | B: 638/1410 | C: 577/1471 [LOSS Ex1] A: 0.63656 | B: 0.62183 | C: 0.61545 [LOGITS Ex2 A] Mean Abs: 2.205 | Max: 6.243 [LOSS Ex2] A: 0.12654 | B: 0.33337 | C: 0.22883 ** [JOINT LOSS] ** : 0.854198 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003562 | Grad Max: 0.154519 -> Layer: shared_layers.0.bias | Grad Mean: 0.458187 | Grad Max: 2.116078 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006193 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001208 | Grad Max: 0.001208 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003025 | Grad Max: 0.507045 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055470 | Grad Max: 2.814749 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000323 | Grad Max: 0.013262 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026114 | Grad Max: 0.142583 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000438 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004974 | Grad Max: 0.010662 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000232 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001221 | Grad Max: 0.003463 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000662 | Grad Max: 0.002159 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019025 | Grad Max: 0.019025 [GRADIENT NORM TOTAL] 10.7985 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.866 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6312126 0.36878747] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.078 [MASKS] A(Pass/Fail): 590/1026 | B: 639/1409 | C: 580/1468 [LOSS Ex1] A: 0.63477 | B: 0.61742 | C: 0.61582 [LOGITS Ex2 A] Mean Abs: 2.220 | Max: 9.098 [LOSS Ex2] A: 0.10963 | B: 0.30616 | C: 0.23913 ** [JOINT LOSS] ** : 0.840976 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002982 | Grad Max: 0.077644 -> Layer: shared_layers.0.bias | Grad Mean: 0.182223 | Grad Max: 0.822475 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002247 | Grad Max: 0.006321 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009482 | Grad Max: 0.009482 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001139 | Grad Max: 0.613079 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020449 | Grad Max: 3.409163 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004966 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008384 | Grad Max: 0.050990 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001775 | Grad Max: 0.004951 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000126 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000448 | Grad Max: 0.001676 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000398 | Grad Max: 0.001676 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008408 | Grad Max: 0.008408 [GRADIENT NORM TOTAL] 5.3798 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.065 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076763 0.49232367] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 718/1330 | B: 602/1254 | C: 573/1475 [LOSS Ex1] A: 0.63544 | B: 0.62172 | C: 0.61562 [LOGITS Ex2 A] Mean Abs: 2.180 | Max: 9.027 [LOSS Ex2] A: 0.10307 | B: 0.31068 | C: 0.24046 ** [JOINT LOSS] ** : 0.842331 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004214 | Grad Max: 0.154785 -> Layer: shared_layers.0.bias | Grad Mean: 0.215698 | Grad Max: 0.996888 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006121 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001610 | Grad Max: 0.001610 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001527 | Grad Max: 0.206105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027061 | Grad Max: 1.144174 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.006426 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013730 | Grad Max: 0.068284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000263 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002796 | Grad Max: 0.006502 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000165 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000690 | Grad Max: 0.002296 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000437 | Grad Max: 0.001662 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010858 | Grad Max: 0.010858 [GRADIENT NORM TOTAL] 4.7817 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.005 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087766 0.49122337] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 579/1469 [LOSS Ex1] A: 0.63220 | B: 0.62111 | C: 0.61155 [LOGITS Ex2 A] Mean Abs: 
2.199 | Max: 6.869 [LOSS Ex2] A: 0.11405 | B: 0.33381 | C: 0.21532 ** [JOINT LOSS] ** : 0.842684 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002519 | Grad Max: 0.067958 -> Layer: shared_layers.0.bias | Grad Mean: 0.153370 | Grad Max: 0.828845 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002233 | Grad Max: 0.006448 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000788 | Grad Max: 0.000788 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000991 | Grad Max: 0.207923 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017804 | Grad Max: 1.163873 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004858 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007227 | Grad Max: 0.053119 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001476 | Grad Max: 0.004538 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000369 | Grad Max: 0.001174 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000385 | Grad Max: 0.001134 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005902 | Grad Max: 0.005902 [GRADIENT NORM TOTAL] 3.4136 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.036 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50676334 0.49323666] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 717/1331 | B: 638/1410 | C: 393/983 [LOSS Ex1] A: 0.63120 | B: 0.62170 | C: 0.61239 [LOGITS Ex2 A] Mean Abs: 2.174 | Max: 7.632 [LOSS Ex2] A: 0.12193 | B: 0.32388 | C: 0.21333 ** [JOINT LOSS] ** : 0.841473 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003471 | Grad Max: 0.143627 -> Layer: shared_layers.0.bias | Grad Mean: 0.133953 | Grad Max: 0.808312 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.006282 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.002417 | Grad Max: 0.002417 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001256 | Grad Max: 0.274503 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022165 | Grad Max: 1.499372 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.005547 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009612 | Grad Max: 0.058210 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002045 | Grad Max: 0.005321 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000514 | Grad Max: 0.001557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000451 | Grad Max: 0.001465 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008085 | Grad Max: 0.008085 [GRADIENT NORM TOTAL] 3.8797 [EPOCH SUMMARY] Train Loss: 0.8477 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8320 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 142/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.856 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013791 0.49862093] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.077 [MASKS] A(Pass/Fail): 689/1359 | B: 639/1409 | C: 576/1472 [LOSS Ex1] A: 0.63989 | B: 0.61728 | C: 0.61280 [LOGITS Ex2 A] Mean Abs: 2.110 | Max: 5.922 [LOSS Ex2] A: 0.11990 | B: 0.31578 | C: 0.22700 ** [JOINT LOSS] ** : 0.844212 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004605 | Grad Max: 0.163125 -> Layer: shared_layers.0.bias | Grad Mean: 0.437420 | Grad Max: 2.238805 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002253 | Grad Max: 0.006579 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010834 | Grad Max: 0.010834 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002806 | Grad Max: 0.613294 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.052376 | Grad Max: 3.447569 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.011679 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026874 | Grad Max: 0.147479 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000470 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005277 | Grad Max: 0.010988 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000246 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001304 | Grad Max: 0.003897 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.002257 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021387 | Grad Max: 0.021387 [GRADIENT NORM TOTAL] 10.1440 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.771 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439282 0.4560718] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.074 [MASKS] A(Pass/Fail): 686/1362 | B: 603/1253 | C: 594/1454 [LOSS Ex1] A: 0.64024 | B: 0.62158 | C: 0.61316 [LOGITS Ex2 A] Mean Abs: 2.097 | Max: 6.391 [LOSS Ex2] A: 0.11736 | B: 0.31145 | C: 0.25850 ** [JOINT LOSS] ** : 0.854097 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005276 | Grad Max: 0.146606 -> Layer: shared_layers.0.bias | Grad Mean: 0.397915 | Grad Max: 1.930753 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005909 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007481 | Grad Max: 0.007481 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002410 | Grad Max: 0.404397 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044924 | Grad Max: 2.257474 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000300 | Grad Max: 0.011910 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024172 | Grad Max: 0.135550 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000394 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004766 | Grad Max: 0.009959 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000246 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001184 | Grad Max: 0.003546 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000768 | Grad Max: 0.002345 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019362 | Grad Max: 0.019362 [GRADIENT NORM TOTAL] 8.2140 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.963 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7839656 0.21603446] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 750/1298 | B: 650/1398 | C: 549/1499 [LOSS Ex1] A: 0.63394 | B: 0.62097 | C: 0.61820 [LOGITS Ex2 A] Mean Abs: 2.189 | Max: 7.175 [LOSS Ex2] A: 0.12015 | B: 0.33299 | C: 0.23042 ** [JOINT LOSS] ** : 0.852223 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004430 | Grad Max: 0.118544 -> Layer: shared_layers.0.bias | Grad Mean: 0.261972 | Grad Max: 1.234533 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.006076 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002405 | Grad Max: 0.002405 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001958 | Grad Max: 0.243782 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035293 | Grad Max: 1.366867 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.009475 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017768 | Grad Max: 0.096934 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000321 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003686 | Grad Max: 0.007894 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000178 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000923 | Grad Max: 0.002538 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000599 | Grad Max: 0.001586 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014989 | Grad Max: 0.014989 [GRADIENT NORM TOTAL] 5.9790 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.068 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50043523 0.49956477] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.077 [MASKS] A(Pass/Fail): 720/1328 | B: 638/1410 | C: 533/1515 [LOSS Ex1] A: 0.64088 | B: 0.62156 | C: 0.61858 [LOGITS Ex2 A] Mean Abs: 2.194 | Max: 6.294 [LOSS Ex2] A: 0.10162 | B: 0.33019 | C: 0.22394 ** [JOINT LOSS] ** : 0.845591 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003047 | Grad Max: 0.096265 -> Layer: shared_layers.0.bias | Grad Mean: 0.274828 | Grad Max: 1.310618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005182 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001005 | Grad Max: 0.001005 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.391989 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036770 | Grad Max: 2.206040 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.009127 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016534 | Grad Max: 0.112661 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000307 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003129 | Grad Max: 0.007497 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000759 | Grad Max: 0.002277 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000445 | Grad Max: 0.001478 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010597 | Grad Max: 0.010597 [GRADIENT NORM TOTAL] 6.9062 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.753 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.71949035 0.28050962] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 714/1334 | B: 639/1409 | C: 571/1477 [LOSS Ex1] A: 0.63638 | B: 0.61714 | C: 0.61480 [LOGITS Ex2 A] Mean Abs: 2.153 | Max: 6.647 
[LOSS Ex2] A: 0.12283 | B: 0.32187 | C: 0.25100 ** [JOINT LOSS] ** : 0.854671 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005473 | Grad Max: 0.147062 -> Layer: shared_layers.0.bias | Grad Mean: 0.256686 | Grad Max: 1.213346 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005580 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002524 | Grad Max: 0.002524 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001755 | Grad Max: 0.195491 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031132 | Grad Max: 1.093815 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000199 | Grad Max: 0.007040 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015669 | Grad Max: 0.087480 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000315 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003355 | Grad Max: 0.007280 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000238 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000835 | Grad Max: 0.002638 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000625 | Grad Max: 0.001667 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014421 | Grad Max: 0.014421 [GRADIENT NORM TOTAL] 5.3332 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.870 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63159364 0.3684064 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 590/1026 | B: 605/1251 | C: 554/1494 [LOSS Ex1] A: 0.63457 | B: 0.62143 | C: 0.61584 [LOGITS Ex2 A] Mean Abs: 2.203 | Max: 7.458 [LOSS Ex2] A: 0.10794 | B: 0.31738 | C: 0.22993 ** [JOINT LOSS] ** : 0.842364 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002360 | Grad Max: 0.066393 -> Layer: shared_layers.0.bias | Grad Mean: 0.156373 | Grad Max: 0.986257 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005952 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002658 | Grad Max: 
0.002658 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001158 | Grad Max: 0.174153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021044 | Grad Max: 0.971357 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.005631 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010326 | Grad Max: 0.075048 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000218 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002011 | Grad Max: 0.005540 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000133 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000502 | Grad Max: 0.001633 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001525 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008131 | Grad Max: 0.008131 [GRADIENT NORM TOTAL] 3.7225 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.070 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076785 0.49232146] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 718/1330 | B: 650/1398 | C: 561/1487 [LOSS Ex1] A: 0.63525 | B: 0.62084 | C: 0.61526 [LOGITS Ex2 A] Mean Abs: 2.233 | Max: 9.441 [LOSS Ex2] A: 0.11615 | B: 0.33384 | C: 0.24046 ** [JOINT LOSS] ** : 0.853930 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006371 | Grad Max: 0.206721 -> Layer: shared_layers.0.bias | Grad Mean: 0.517941 | Grad Max: 2.666577 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005619 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005203 | Grad Max: 0.005203 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003194 | Grad Max: 0.539667 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058850 | Grad Max: 3.020158 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012500 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030328 | Grad Max: 0.153466 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000536 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.006041 | Grad Max: 0.012772 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000260 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001456 | Grad Max: 0.003973 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000806 | Grad Max: 0.002027 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021565 | Grad Max: 0.021565 [GRADIENT NORM TOTAL] 11.1108 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.009 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50868225 0.49131778] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 715/1333 | B: 638/1410 | C: 558/1490 [LOSS Ex1] A: 0.63200 | B: 0.62142 | C: 0.61598 [LOGITS Ex2 A] Mean Abs: 2.229 | Max: 6.214 [LOSS Ex2] A: 0.11383 | B: 0.32653 | C: 0.21548 ** [JOINT LOSS] ** : 0.841742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008268 | Grad Max: 0.231180 -> Layer: shared_layers.0.bias | Grad Mean: 0.603621 | Grad Max: 3.087987 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.006657 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001330 | Grad Max: 0.001330 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003835 | Grad Max: 0.602944 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070376 | Grad Max: 3.392644 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000438 | Grad Max: 0.014064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035234 | Grad Max: 0.169086 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000599 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007170 | Grad Max: 0.015145 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000283 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001779 | Grad Max: 0.004537 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001042 | Grad Max: 0.002704 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027573 | Grad Max: 0.027573 [GRADIENT NORM 
TOTAL] 13.0451 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.040 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068745 0.4931255] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 717/1331 | B: 639/1409 | C: 572/1476 [LOSS Ex1] A: 0.63099 | B: 0.61700 | C: 0.61891 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 7.171 [LOSS Ex2] A: 0.13182 | B: 0.31576 | C: 0.22584 ** [JOINT LOSS] ** : 0.846774 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005702 | Grad Max: 0.176677 -> Layer: shared_layers.0.bias | Grad Mean: 0.185105 | Grad Max: 0.663919 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002236 | Grad Max: 0.006191 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001186 | Grad Max: 0.001186 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001368 | Grad Max: 0.342803 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023888 | Grad Max: 1.845327 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000121 | Grad Max: 0.004498 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008839 | Grad Max: 0.045928 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000267 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001985 | Grad Max: 0.005157 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000486 | Grad Max: 0.001337 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001130 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007098 | Grad Max: 0.007098 [GRADIENT NORM TOTAL] 4.6348 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.860 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013945 0.49860546] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.558 | Std: 0.077 [MASKS] A(Pass/Fail): 689/1359 | B: 606/1250 | C: 553/1495 [LOSS Ex1] A: 
0.63970 | B: 0.62129 | C: 0.61370 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 6.153 [LOSS Ex2] A: 0.10954 | B: 0.33116 | C: 0.21961 ** [JOINT LOSS] ** : 0.844997 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006447 | Grad Max: 0.236609 -> Layer: shared_layers.0.bias | Grad Mean: 0.712238 | Grad Max: 3.263726 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005917 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002616 | Grad Max: 0.002616 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004623 | Grad Max: 0.796183 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086576 | Grad Max: 4.436524 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000559 | Grad Max: 0.018500 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045859 | Grad Max: 0.235296 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000730 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009079 | Grad Max: 0.019840 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000382 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002222 | Grad Max: 0.005982 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001365 | Grad Max: 0.003082 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035083 | Grad Max: 0.035083 [GRADIENT NORM TOTAL] 15.9160 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.775 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439716 0.4560284] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.557 | Std: 0.075 [MASKS] A(Pass/Fail): 686/1362 | B: 650/1398 | C: 569/1479 [LOSS Ex1] A: 0.64006 | B: 0.62071 | C: 0.61231 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.797 [LOSS Ex2] A: 0.12291 | B: 0.36269 | C: 0.21764 ** [JOINT LOSS] ** : 0.858773 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011071 | Grad Max: 0.309662 -> Layer: shared_layers.0.bias | Grad Mean: 0.924026 | Grad Max: 4.098980 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad 
Max: 0.006230 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010284 | Grad Max: 0.010284 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005982 | Grad Max: 0.939016 -> Layer: exit2_layers.0.bias | Grad Mean: 0.111283 | Grad Max: 5.225818 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000742 | Grad Max: 0.023866 -> Layer: exit2_layers.3.bias | Grad Mean: 0.060065 | Grad Max: 0.301664 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000937 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011993 | Grad Max: 0.023838 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000532 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002967 | Grad Max: 0.008007 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001845 | Grad Max: 0.003990 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047571 | Grad Max: 0.047571 [GRADIENT NORM TOTAL] 20.0928 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.968 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.78499407 0.21500598] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 750/1298 | B: 638/1410 | C: 618/1430 [LOSS Ex1] A: 0.63375 | B: 0.62129 | C: 0.61183 [LOGITS Ex2 A] Mean Abs: 2.168 | Max: 7.094 [LOSS Ex2] A: 0.10004 | B: 0.34698 | C: 0.22106 ** [JOINT LOSS] ** : 0.844981 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005050 | Grad Max: 0.145668 -> Layer: shared_layers.0.bias | Grad Mean: 0.424204 | Grad Max: 1.898108 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002259 | Grad Max: 0.006464 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004523 | Grad Max: 0.004523 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002581 | Grad Max: 0.465232 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048031 | Grad Max: 2.605263 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000322 | Grad Max: 0.010513 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025793 | Grad Max: 0.133067 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000427 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005110 | Grad Max: 0.010466 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000248 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001264 | Grad Max: 0.003704 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000837 | Grad Max: 0.002452 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020655 | Grad Max: 0.020655 [GRADIENT NORM TOTAL] 9.1085 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50042045 0.49957955] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.078 [MASKS] A(Pass/Fail): 721/1327 | B: 639/1409 | C: 553/1495 [LOSS Ex1] A: 0.64071 | B: 0.61687 | C: 0.61773 [LOGITS Ex2 A] Mean Abs: 2.249 | Max: 6.057 [LOSS Ex2] A: 0.10269 | B: 0.31314 | C: 0.23997 ** [JOINT LOSS] ** : 0.843704 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004798 | Grad Max: 0.222319 -> Layer: shared_layers.0.bias | Grad Mean: 0.567043 | Grad Max: 2.914605 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005308 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001163 | Grad Max: 0.001163 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003715 | Grad Max: 0.581839 -> Layer: exit2_layers.0.bias | Grad Mean: 0.068620 | Grad Max: 3.259509 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000448 | Grad Max: 0.015164 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036765 | Grad Max: 0.198725 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000539 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007047 | Grad Max: 0.014896 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000302 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001735 | Grad Max: 0.004813 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000967 | Grad Max: 0.002455 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.026791 | Grad Max: 0.026791 [GRADIENT NORM TOTAL] 12.7368 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.757 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7202555 0.2797445] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 714/1334 | B: 606/1250 | C: 367/1009 [LOSS Ex1] A: 0.63619 | B: 0.62116 | C: 0.62187 [LOGITS Ex2 A] Mean Abs: 2.222 | Max: 5.988 [LOSS Ex2] A: 0.13131 | B: 0.32760 | C: 0.25425 ** [JOINT LOSS] ** : 0.864129 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006352 | Grad Max: 0.340114 -> Layer: shared_layers.0.bias | Grad Mean: 0.873241 | Grad Max: 4.366911 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.006105 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003222 | Grad Max: 0.003222 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005678 | Grad Max: 0.835491 -> Layer: exit2_layers.0.bias | Grad Mean: 0.105980 | Grad Max: 4.625298 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000686 | Grad Max: 0.024130 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056781 | Grad Max: 0.304595 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000840 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011166 | Grad Max: 0.022583 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000451 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002797 | Grad Max: 0.007192 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001700 | Grad Max: 0.003355 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045279 | Grad Max: 0.045279 [GRADIENT NORM TOTAL] 19.4901 [EPOCH SUMMARY] Train Loss: 0.8494 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8340 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 143/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.874 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6319893 0.36801073] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 590/1026 | B: 650/1398 | C: 579/1469 [LOSS Ex1] A: 0.63439 | B: 0.62059 | C: 0.61325 [LOGITS Ex2 A] Mean Abs: 2.263 | Max: 9.470 [LOSS Ex2] A: 0.12027 | B: 0.32680 | C: 0.22692 ** [JOINT LOSS] ** : 0.847402 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003878 | Grad Max: 0.196128 -> Layer: shared_layers.0.bias | Grad Mean: 0.447137 | Grad Max: 2.435615 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.006066 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006185 | Grad Max: 0.006185 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002873 | Grad Max: 0.453488 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053203 | Grad Max: 2.487609 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000349 | Grad Max: 0.012325 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028569 | Grad Max: 0.153748 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000492 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005482 | Grad Max: 0.011800 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000250 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001372 | Grad Max: 0.003413 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000792 | Grad Max: 0.002064 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021737 | Grad Max: 0.021737 [GRADIENT NORM TOTAL] 9.9824 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.074 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50767803 0.49232197] | 
Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 719/1329 | B: 638/1410 | C: 576/1472 [LOSS Ex1] A: 0.63508 | B: 0.62117 | C: 0.61345 [LOGITS Ex2 A] Mean Abs: 2.191 | Max: 7.661 [LOSS Ex2] A: 0.10531 | B: 0.33198 | C: 0.23288 ** [JOINT LOSS] ** : 0.846622 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005782 | Grad Max: 0.235009 -> Layer: shared_layers.0.bias | Grad Mean: 0.290568 | Grad Max: 1.067762 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.005503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000675 | Grad Max: 0.000675 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001888 | Grad Max: 0.285805 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033597 | Grad Max: 1.484036 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.008120 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017673 | Grad Max: 0.110226 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003681 | Grad Max: 0.008367 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000215 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000913 | Grad Max: 0.002593 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000605 | Grad Max: 0.001889 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015167 | Grad Max: 0.015167 [GRADIENT NORM TOTAL] 5.8060 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.013 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5087008 0.49129924] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 716/1332 | B: 639/1409 | C: 594/1454 [LOSS Ex1] A: 0.63182 | B: 0.61675 | C: 0.61213 [LOGITS Ex2 A] Mean Abs: 2.180 | Max: 6.047 [LOSS Ex2] A: 0.11296 | B: 0.31747 | C: 0.22495 ** [JOINT LOSS] ** : 0.838696 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004099 | Grad Max: 
0.125846 -> Layer: shared_layers.0.bias | Grad Mean: 0.364704 | Grad Max: 1.696615 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002338 | Grad Max: 0.006665 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005842 | Grad Max: 0.005842 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002373 | Grad Max: 0.296586 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043667 | Grad Max: 1.633076 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000265 | Grad Max: 0.009177 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021793 | Grad Max: 0.119705 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000426 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004340 | Grad Max: 0.009781 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000218 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001057 | Grad Max: 0.003143 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000638 | Grad Max: 0.001925 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016848 | Grad Max: 0.016848 [GRADIENT NORM TOTAL] 7.7406 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.044 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50685906 0.4931409 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 717/1331 | B: 606/1250 | C: 583/1465 [LOSS Ex1] A: 0.63082 | B: 0.62104 | C: 0.61128 [LOGITS Ex2 A] Mean Abs: 2.173 | Max: 6.900 [LOSS Ex2] A: 0.12290 | B: 0.31469 | C: 0.23929 ** [JOINT LOSS] ** : 0.846676 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004037 | Grad Max: 0.161669 -> Layer: shared_layers.0.bias | Grad Mean: 0.082230 | Grad Max: 0.312750 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006382 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000799 | Grad Max: 0.000799 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000893 | Grad Max: 0.143716 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013901 | Grad Max: 0.779484 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002646 | Grad Max: 0.040794 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000170 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000356 | Grad Max: 0.002433 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000060 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000727 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000257 | Grad Max: 0.000895 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002208 | Grad Max: 0.002208 [GRADIENT NORM TOTAL] 2.7070 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.863 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50129867 0.49870133] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 689/1359 | B: 650/1398 | C: 545/1503 [LOSS Ex1] A: 0.63955 | B: 0.62047 | C: 0.62618 [LOGITS Ex2 A] Mean Abs: 2.176 | Max: 5.621 [LOSS Ex2] A: 0.10985 | B: 0.32340 | C: 0.24922 ** [JOINT LOSS] ** : 0.856226 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005175 | Grad Max: 0.136092 -> Layer: shared_layers.0.bias | Grad Mean: 0.410312 | Grad Max: 1.532676 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001955 | Grad Max: 0.005621 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008732 | Grad Max: 0.008732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002587 | Grad Max: 0.353919 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048161 | Grad Max: 1.965453 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.013402 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026180 | Grad Max: 0.161150 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000468 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005219 | Grad Max: 0.011552 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000259 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001266 | Grad Max: 0.003825 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000721 | Grad Max: 0.001988 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018919 | Grad Max: 0.018919 [GRADIENT NORM TOTAL] 8.3000 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.128 | Max: 0.778 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5440374 0.4559626] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 686/1362 | B: 639/1409 | C: 550/1498 [LOSS Ex1] A: 0.63992 | B: 0.62105 | C: 0.61321 [LOGITS Ex2 A] Mean Abs: 2.143 | Max: 6.034 [LOSS Ex2] A: 0.12260 | B: 0.33050 | C: 0.24092 ** [JOINT LOSS] ** : 0.856065 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002784 | Grad Max: 0.069253 -> Layer: shared_layers.0.bias | Grad Mean: 0.189135 | Grad Max: 0.756457 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006228 | Grad Max: 0.006228 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001560 | Grad Max: 0.351085 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028347 | Grad Max: 1.983019 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000150 | Grad Max: 0.007668 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012186 | Grad Max: 0.076808 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000244 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002294 | Grad Max: 0.005894 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001746 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001181 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008140 | Grad Max: 0.008140 [GRADIENT NORM TOTAL] 5.3251 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.141 | Max: 0.971 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.78595364 0.21404637] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 751/1297 | B: 639/1409 | C: 568/1480 [LOSS Ex1] A: 0.63358 | B: 0.61663 | C: 0.61497 [LOGITS Ex2 A] Mean Abs: 2.139 | Max: 5.629 [LOSS Ex2] A: 0.11316 | B: 0.31185 | C: 0.20976 ** [JOINT LOSS] ** : 0.833317 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005037 | Grad Max: 0.159678 -> Layer: shared_layers.0.bias | Grad Mean: 0.400216 | Grad Max: 2.322187 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.005988 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004832 | Grad Max: 0.004832 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002536 | Grad Max: 0.620209 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046682 | Grad Max: 3.440555 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000272 | Grad Max: 0.008869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022164 | Grad Max: 0.108927 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000435 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004515 | Grad Max: 0.010206 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000241 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001103 | Grad Max: 0.003354 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000663 | Grad Max: 0.002364 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016859 | Grad Max: 0.016859 [GRADIENT NORM TOTAL] 9.7567 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 1.077 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004798 0.49952024] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 721/1327 | B: 606/1250 | C: 566/1482 [LOSS Ex1] A: 0.64056 | B: 0.62091 | C: 0.61646 [LOGITS Ex2 A] Mean Abs: 2.135 | Max: 6.095 [LOSS Ex2] A: 0.10156 | B: 0.31844 | C: 0.22864 ** [JOINT LOSS] ** : 0.842193 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006192 | Grad Max: 0.170674 -> Layer: shared_layers.0.bias | Grad Mean: 0.486411 | Grad Max: 2.204992 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005651 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003739 | Grad Max: 0.003739 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003047 | Grad Max: 0.702339 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055576 | Grad Max: 3.922605 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.011455 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028663 | Grad Max: 0.146542 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000505 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005820 | Grad Max: 0.012846 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001438 | Grad Max: 0.004129 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000869 | Grad Max: 0.002528 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022444 | Grad Max: 0.022444 [GRADIENT NORM TOTAL] 10.6624 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.760 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72100353 0.2789965 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 715/1333 | B: 651/1397 | C: 547/1501 [LOSS Ex1] A: 0.63603 | B: 0.62035 | C: 0.61510 [LOGITS Ex2 A] Mean Abs: 2.137 | Max: 6.528 [LOSS Ex2] A: 0.12596 | B: 0.32925 | C: 0.22882 ** [JOINT LOSS] ** : 0.851835 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004086 | Grad Max: 0.124351 -> Layer: shared_layers.0.bias | Grad Mean: 0.125706 | Grad Max: 0.666765 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.006038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003596 | Grad Max: 0.003596 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000899 | Grad Max: 
0.128382 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015166 | Grad Max: 0.671980 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003442 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004454 | Grad Max: 0.032305 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001042 | Grad Max: 0.003578 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000087 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000260 | Grad Max: 0.001043 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000303 | Grad Max: 0.001340 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004836 | Grad Max: 0.004836 [GRADIENT NORM TOTAL] 2.8002 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.878 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63233 0.36767] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 590/1026 | B: 639/1409 | C: 542/1506 [LOSS Ex1] A: 0.63422 | B: 0.62091 | C: 0.61889 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 10.109 [LOSS Ex2] A: 0.10968 | B: 0.34092 | C: 0.26550 ** [JOINT LOSS] ** : 0.863376 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007658 | Grad Max: 0.266868 -> Layer: shared_layers.0.bias | Grad Mean: 0.650440 | Grad Max: 3.560274 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.007068 -> Layer: exit1_layers.0.bias | Grad Mean: 0.015286 | Grad Max: 0.015286 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004058 | Grad Max: 0.766253 -> Layer: exit2_layers.0.bias | Grad Mean: 0.075395 | Grad Max: 4.276077 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000454 | Grad Max: 0.015786 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037061 | Grad Max: 0.183044 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000616 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007394 | Grad Max: 0.015489 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000331 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001798 | Grad Max: 0.005130 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001009 | Grad Max: 0.002211 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026531 | Grad Max: 0.026531 [GRADIENT NORM TOTAL] 14.6233 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.079 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50764006 0.49235994] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 719/1329 | B: 639/1409 | C: 532/1516 [LOSS Ex1] A: 0.63490 | B: 0.61649 | C: 0.61880 [LOGITS Ex2 A] Mean Abs: 2.228 | Max: 7.319 [LOSS Ex2] A: 0.11030 | B: 0.32891 | C: 0.23689 ** [JOINT LOSS] ** : 0.848766 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010396 | Grad Max: 0.312777 -> Layer: shared_layers.0.bias | Grad Mean: 0.837695 | Grad Max: 4.114371 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005773 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000999 | Grad Max: 0.000999 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005239 | Grad Max: 0.909402 -> Layer: exit2_layers.0.bias | Grad Mean: 0.096244 | Grad Max: 5.069708 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000608 | Grad Max: 0.019343 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049728 | Grad Max: 0.252582 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000767 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010196 | Grad Max: 0.020576 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000432 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002555 | Grad Max: 0.006680 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001509 | Grad Max: 0.003226 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040459 | Grad Max: 0.040459 [GRADIENT NORM TOTAL] 18.2585 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.017 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5086216 0.4913784] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 716/1332 | B: 606/1250 | C: 563/1485 [LOSS Ex1] A: 0.63164 | B: 0.62078 | C: 0.61174 [LOGITS Ex2 A] Mean Abs: 2.189 | Max: 7.132 [LOSS Ex2] A: 0.11788 | B: 0.30774 | C: 0.21658 ** [JOINT LOSS] ** : 0.835453 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007637 | Grad Max: 0.201215 -> Layer: shared_layers.0.bias | Grad Mean: 0.322263 | Grad Max: 1.522661 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002265 | Grad Max: 0.006834 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004150 | Grad Max: 0.004150 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.420166 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039232 | Grad Max: 2.340578 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000247 | Grad Max: 0.007775 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019293 | Grad Max: 0.103066 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000385 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004121 | Grad Max: 0.008513 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000193 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001090 | Grad Max: 0.002916 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000708 | Grad Max: 0.002142 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018613 | Grad Max: 0.018613 [GRADIENT NORM TOTAL] 7.2260 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.049 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50694525 0.49305472] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 718/1330 | B: 651/1397 | C: 597/1451 [LOSS Ex1] A: 0.63064 | B: 0.62021 | C: 0.61482 [LOGITS Ex2 A] Mean Abs: 2.119 | Max: 6.024 
[LOSS Ex2] A: 0.12182 | B: 0.35800 | C: 0.22645 ** [JOINT LOSS] ** : 0.857314 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005580 | Grad Max: 0.218806 -> Layer: shared_layers.0.bias | Grad Mean: 0.577120 | Grad Max: 2.928868 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006485 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001076 | Grad Max: 0.001076 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003815 | Grad Max: 0.536136 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069176 | Grad Max: 2.974916 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.015248 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033994 | Grad Max: 0.202156 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000567 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006288 | Grad Max: 0.013492 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001565 | Grad Max: 0.004447 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000908 | Grad Max: 0.002429 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025589 | Grad Max: 0.025589 [GRADIENT NORM TOTAL] 13.0139 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.867 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012965 0.49870348] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 689/1359 | B: 639/1409 | C: 397/979 [LOSS Ex1] A: 0.63938 | B: 0.62079 | C: 0.61068 [LOGITS Ex2 A] Mean Abs: 2.073 | Max: 6.309 [LOSS Ex2] A: 0.11391 | B: 0.36642 | C: 0.21314 ** [JOINT LOSS] ** : 0.854772 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005810 | Grad Max: 0.291340 -> Layer: shared_layers.0.bias | Grad Mean: 0.844417 | Grad Max: 3.924367 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006459 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011998 | Grad Max: 
0.011998 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005474 | Grad Max: 0.816300 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102739 | Grad Max: 4.595547 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000669 | Grad Max: 0.025557 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056064 | Grad Max: 0.317720 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000804 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011032 | Grad Max: 0.022531 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000468 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002729 | Grad Max: 0.007261 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001556 | Grad Max: 0.003464 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042838 | Grad Max: 0.042838 [GRADIENT NORM TOTAL] 19.0797 [EPOCH SUMMARY] Train Loss: 0.8485 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8357 | Alpha: 0.5500 No improve count: 4/15 ############################## EPOCH 144/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.781 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5439925 0.45600754] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 686/1362 | B: 640/1408 | C: 568/1480 [LOSS Ex1] A: 0.63976 | B: 0.61637 | C: 0.61277 [LOGITS Ex2 A] Mean Abs: 2.069 | Max: 6.109 [LOSS Ex2] A: 0.11853 | B: 0.33169 | C: 0.22148 ** [JOINT LOSS] ** : 0.846867 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003979 | Grad Max: 0.209672 -> Layer: shared_layers.0.bias | Grad Mean: 0.559466 | Grad Max: 2.810719 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005485 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003891 | Grad Max: 0.003891 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003648 | Grad Max: 0.630922 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.068159 | Grad Max: 3.519400 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.018429 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035578 | Grad Max: 0.227594 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000483 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006965 | Grad Max: 0.014082 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001711 | Grad Max: 0.004822 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000993 | Grad Max: 0.002527 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027106 | Grad Max: 0.027106 [GRADIENT NORM TOTAL] 12.9580 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.975 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7867267 0.21327327] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 751/1297 | B: 606/1250 | C: 544/1504 [LOSS Ex1] A: 0.63342 | B: 0.62065 | C: 0.61964 [LOGITS Ex2 A] Mean Abs: 2.153 | Max: 6.684 [LOSS Ex2] A: 0.11276 | B: 0.31610 | C: 0.23583 ** [JOINT LOSS] ** : 0.846135 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005200 | Grad Max: 0.144395 -> Layer: shared_layers.0.bias | Grad Mean: 0.227397 | Grad Max: 1.022115 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005805 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004487 | Grad Max: 0.004487 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001686 | Grad Max: 0.306162 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030595 | Grad Max: 1.572039 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000167 | Grad Max: 0.007300 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012891 | Grad Max: 0.081571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002707 | Grad Max: 0.006267 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000009 | Grad Max: 0.000144 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000700 | Grad Max: 0.001964 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000452 | Grad Max: 0.001547 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010929 | Grad Max: 0.010929 [GRADIENT NORM TOTAL] 5.4449 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.081 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50043803 0.49956197] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 721/1327 | B: 651/1397 | C: 592/1456 [LOSS Ex1] A: 0.64041 | B: 0.62010 | C: 0.61021 [LOGITS Ex2 A] Mean Abs: 2.204 | Max: 5.631 [LOSS Ex2] A: 0.09986 | B: 0.33256 | C: 0.23054 ** [JOINT LOSS] ** : 0.844563 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004618 | Grad Max: 0.212585 -> Layer: shared_layers.0.bias | Grad Mean: 0.508400 | Grad Max: 2.679003 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005896 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001412 | Grad Max: 0.001412 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003147 | Grad Max: 0.479446 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058960 | Grad Max: 2.678427 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.016286 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031801 | Grad Max: 0.192844 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000478 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006137 | Grad Max: 0.012487 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000256 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001538 | Grad Max: 0.004000 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000860 | Grad Max: 0.002462 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024046 | Grad Max: 0.024046 [GRADIENT NORM TOTAL] 11.1311 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | 
Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.763 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7214717 0.27852824] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 715/1333 | B: 639/1409 | C: 565/1483 [LOSS Ex1] A: 0.63588 | B: 0.62067 | C: 0.61614 [LOGITS Ex2 A] Mean Abs: 2.147 | Max: 6.414 [LOSS Ex2] A: 0.12112 | B: 0.33245 | C: 0.23543 ** [JOINT LOSS] ** : 0.853895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004194 | Grad Max: 0.140329 -> Layer: shared_layers.0.bias | Grad Mean: 0.185621 | Grad Max: 0.922879 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006331 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012276 | Grad Max: 0.012276 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001353 | Grad Max: 0.224931 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023019 | Grad Max: 1.212999 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.005403 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005925 | Grad Max: 0.051805 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000171 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000914 | Grad Max: 0.003278 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000080 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.001030 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.000848 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002821 | Grad Max: 0.002821 [GRADIENT NORM TOTAL] 4.3975 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.881 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6326799 0.3673201] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 590/1026 | B: 640/1408 | C: 528/1520 [LOSS Ex1] A: 0.63407 | B: 0.61626 | C: 0.61937 [LOGITS Ex2 A] Mean Abs: 2.172 | Max: 8.857 [LOSS Ex2] A: 0.11542 | B: 0.31576 
| C: 0.24602 ** [JOINT LOSS] ** : 0.848965 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006626 | Grad Max: 0.184106 -> Layer: shared_layers.0.bias | Grad Mean: 0.458099 | Grad Max: 1.768980 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002174 | Grad Max: 0.005952 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008004 | Grad Max: 0.008004 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002894 | Grad Max: 0.355227 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052936 | Grad Max: 1.972973 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.010628 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028861 | Grad Max: 0.136742 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000465 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005965 | Grad Max: 0.013031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001506 | Grad Max: 0.004101 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000949 | Grad Max: 0.002343 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024775 | Grad Max: 0.024775 [GRADIENT NORM TOTAL] 9.3232 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.082 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50766796 0.49233207] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 719/1329 | B: 606/1250 | C: 576/1472 [LOSS Ex1] A: 0.63475 | B: 0.62053 | C: 0.60945 [LOGITS Ex2 A] Mean Abs: 2.146 | Max: 8.388 [LOSS Ex2] A: 0.10034 | B: 0.32202 | C: 0.22818 ** [JOINT LOSS] ** : 0.838425 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005399 | Grad Max: 0.179016 -> Layer: shared_layers.0.bias | Grad Mean: 0.375441 | Grad Max: 1.640949 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002207 | Grad Max: 0.006018 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001419 | Grad Max: 0.001419 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.002400 | Grad Max: 0.601190 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044207 | Grad Max: 3.350138 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000268 | Grad Max: 0.010005 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021772 | Grad Max: 0.121679 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000408 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004437 | Grad Max: 0.009814 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000218 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001094 | Grad Max: 0.003047 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000680 | Grad Max: 0.002107 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017152 | Grad Max: 0.017152 [GRADIENT NORM TOTAL] 8.4885 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.021 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50858766 0.4914123 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 716/1332 | B: 651/1397 | C: 553/1495 [LOSS Ex1] A: 0.63149 | B: 0.61999 | C: 0.61499 [LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.426 [LOSS Ex2] A: 0.10788 | B: 0.32937 | C: 0.22653 ** [JOINT LOSS] ** : 0.843415 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002220 | Grad Max: 0.081100 -> Layer: shared_layers.0.bias | Grad Mean: 0.151681 | Grad Max: 0.824321 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002312 | Grad Max: 0.006854 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007855 | Grad Max: 0.007855 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001165 | Grad Max: 0.226960 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021225 | Grad Max: 1.256246 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000139 | Grad Max: 0.005615 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011303 | Grad Max: 0.069698 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000241 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.002255 | Grad Max: 0.005452 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000570 | Grad Max: 0.001816 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000431 | Grad Max: 0.001521 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009246 | Grad Max: 0.009246 [GRADIENT NORM TOTAL] 3.8918 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.052 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50698876 0.4930112 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 718/1330 | B: 639/1409 | C: 557/1491 [LOSS Ex1] A: 0.63048 | B: 0.62056 | C: 0.61263 [LOGITS Ex2 A] Mean Abs: 2.147 | Max: 8.405 [LOSS Ex2] A: 0.13247 | B: 0.32894 | C: 0.21181 ** [JOINT LOSS] ** : 0.845633 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004217 | Grad Max: 0.150206 -> Layer: shared_layers.0.bias | Grad Mean: 0.221224 | Grad Max: 0.996747 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.006250 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003137 | Grad Max: 0.003137 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001661 | Grad Max: 0.277787 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029116 | Grad Max: 1.552037 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000176 | Grad Max: 0.005775 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014124 | Grad Max: 0.080384 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000322 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003062 | Grad Max: 0.006877 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000783 | Grad Max: 0.002120 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000469 | Grad Max: 0.001667 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012237 | Grad Max: 0.012237 [GRADIENT NORM TOTAL] 
5.2845 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.870 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012607 0.4987393] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 689/1359 | B: 640/1408 | C: 541/1507 [LOSS Ex1] A: 0.63923 | B: 0.61614 | C: 0.61795 [LOGITS Ex2 A] Mean Abs: 2.093 | Max: 7.001 [LOSS Ex2] A: 0.11255 | B: 0.31356 | C: 0.23151 ** [JOINT LOSS] ** : 0.843648 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003358 | Grad Max: 0.111901 -> Layer: shared_layers.0.bias | Grad Mean: 0.319543 | Grad Max: 1.600701 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005496 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003189 | Grad Max: 0.003189 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.555696 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038414 | Grad Max: 3.117706 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000225 | Grad Max: 0.009133 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018602 | Grad Max: 0.109195 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000317 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003554 | Grad Max: 0.008189 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000181 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000873 | Grad Max: 0.002636 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000547 | Grad Max: 0.001956 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014023 | Grad Max: 0.014023 [GRADIENT NORM TOTAL] 7.9188 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.783 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.543993 0.45600697] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 686/1362 | B: 606/1250 | C: 582/1466 [LOSS Ex1] A: 0.63961 | 
B: 0.62041 | C: 0.61488 [LOGITS Ex2 A] Mean Abs: 2.078 | Max: 5.666 [LOSS Ex2] A: 0.11698 | B: 0.31869 | C: 0.22577 ** [JOINT LOSS] ** : 0.845446 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004027 | Grad Max: 0.129699 -> Layer: shared_layers.0.bias | Grad Mean: 0.388502 | Grad Max: 1.756773 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005624 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001101 | Grad Max: 0.001101 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002358 | Grad Max: 0.418619 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043948 | Grad Max: 2.344821 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000295 | Grad Max: 0.011931 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024383 | Grad Max: 0.153657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000408 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004756 | Grad Max: 0.010466 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000239 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001163 | Grad Max: 0.003541 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000666 | Grad Max: 0.002064 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017548 | Grad Max: 0.017548 [GRADIENT NORM TOTAL] 8.2558 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.979 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7875203 0.21247965] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.079 [MASKS] A(Pass/Fail): 751/1297 | B: 651/1397 | C: 610/1438 [LOSS Ex1] A: 0.63326 | B: 0.61987 | C: 0.60970 [LOGITS Ex2 A] Mean Abs: 2.163 | Max: 6.467 [LOSS Ex2] A: 0.10829 | B: 0.34245 | C: 0.22195 ** [JOINT LOSS] ** : 0.845176 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003433 | Grad Max: 0.135451 -> Layer: shared_layers.0.bias | Grad Mean: 0.226094 | Grad Max: 1.378317 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 
0.006700 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005872 | Grad Max: 0.005872 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001610 | Grad Max: 0.304799 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029242 | Grad Max: 1.677679 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.007844 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012831 | Grad Max: 0.085776 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000020 | Grad Max: 0.000264 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002632 | Grad Max: 0.005910 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000132 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000666 | Grad Max: 0.001785 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001453 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010287 | Grad Max: 0.010287 [GRADIENT NORM TOTAL] 5.6282 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.085 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004259 0.49957415] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.078 [MASKS] A(Pass/Fail): 721/1327 | B: 639/1409 | C: 587/1461 [LOSS Ex1] A: 0.64025 | B: 0.62044 | C: 0.61504 [LOGITS Ex2 A] Mean Abs: 2.176 | Max: 6.013 [LOSS Ex2] A: 0.10085 | B: 0.32193 | C: 0.24132 ** [JOINT LOSS] ** : 0.846605 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002486 | Grad Max: 0.097018 -> Layer: shared_layers.0.bias | Grad Mean: 0.239905 | Grad Max: 1.301092 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005567 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003670 | Grad Max: 0.003670 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001596 | Grad Max: 0.351888 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029354 | Grad Max: 1.981730 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000154 | Grad Max: 0.008158 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012542 | Grad Max: 0.080395 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000208 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002357 | Grad Max: 0.006111 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000562 | Grad Max: 0.001679 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.001289 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007575 | Grad Max: 0.007575 [GRADIENT NORM TOTAL] 5.7227 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.766 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7220352 0.27796477] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 715/1333 | B: 640/1408 | C: 560/1488 [LOSS Ex1] A: 0.63571 | B: 0.61601 | C: 0.61360 [LOGITS Ex2 A] Mean Abs: 2.152 | Max: 6.901 [LOSS Ex2] A: 0.12627 | B: 0.31520 | C: 0.20342 ** [JOINT LOSS] ** : 0.836737 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004545 | Grad Max: 0.117885 -> Layer: shared_layers.0.bias | Grad Mean: 0.201600 | Grad Max: 1.006311 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002194 | Grad Max: 0.005781 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008244 | Grad Max: 0.008244 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001432 | Grad Max: 0.199591 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025797 | Grad Max: 1.096842 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000178 | Grad Max: 0.005440 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014276 | Grad Max: 0.067627 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000298 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002894 | Grad Max: 0.006897 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002255 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000464 | Grad Max: 0.001797 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.010717 | Grad Max: 0.010717 [GRADIENT NORM TOTAL] 4.3756 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.885 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6329312 0.36706886] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 590/1026 | B: 606/1250 | C: 392/984 [LOSS Ex1] A: 0.63390 | B: 0.62027 | C: 0.62253 [LOGITS Ex2 A] Mean Abs: 2.207 | Max: 8.630 [LOSS Ex2] A: 0.10997 | B: 0.31154 | C: 0.24962 ** [JOINT LOSS] ** : 0.849277 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002866 | Grad Max: 0.045399 -> Layer: shared_layers.0.bias | Grad Mean: 0.109826 | Grad Max: 0.568672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.006154 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011889 | Grad Max: 0.011890 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000820 | Grad Max: 0.435877 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013870 | Grad Max: 2.442509 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003463 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002624 | Grad Max: 0.029198 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000141 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000452 | Grad Max: 0.002617 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000116 | Grad Max: 0.001077 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000298 | Grad Max: 0.000967 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001418 | Grad Max: 0.001418 [GRADIENT NORM TOTAL] 3.8524 [EPOCH SUMMARY] Train Loss: 0.8453 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8277 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8312 -> New: 0.8277) ############################## EPOCH 145/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.087 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5076589 0.4923411] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 720/1328 | B: 651/1397 | C: 517/1531 [LOSS Ex1] A: 0.63458 | B: 0.61973 | C: 0.61947 [LOGITS Ex2 A] Mean Abs: 2.212 | Max: 7.610 [LOSS Ex2] A: 0.10750 | B: 0.33266 | C: 0.23096 ** [JOINT LOSS] ** : 0.848298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003898 | Grad Max: 0.187776 -> Layer: shared_layers.0.bias | Grad Mean: 0.357691 | Grad Max: 2.182126 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.005430 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000830 | Grad Max: 0.000830 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.396512 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041631 | Grad Max: 2.209554 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000279 | Grad Max: 0.009751 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023144 | Grad Max: 0.119491 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000372 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004564 | Grad Max: 0.009989 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000213 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001131 | Grad Max: 0.003079 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000634 | Grad Max: 0.001925 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017167 | Grad Max: 0.017167 [GRADIENT NORM TOTAL] 7.8711 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.025 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850946 0.49149057] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 716/1332 | B: 639/1409 | C: 546/1502 [LOSS Ex1] A: 0.63131 | B: 0.62029 | C: 0.61715 [LOGITS Ex2 A] Mean Abs: 2.196 | Max: 5.574 [LOSS Ex2] A: 0.11435 | B: 0.31093 | C: 0.24369 ** [JOINT LOSS] ** : 0.845905 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002667 | Grad Max: 0.059048 -> Layer: shared_layers.0.bias | Grad Mean: 0.131611 | Grad Max: 0.653112 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.006204 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001624 | Grad Max: 0.001624 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001168 | Grad Max: 0.491435 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021037 | Grad Max: 2.731622 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.005102 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007977 | Grad Max: 0.054382 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001629 | Grad Max: 0.004547 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000385 | Grad Max: 0.001418 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000354 | Grad Max: 0.001037 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004718 | Grad Max: 0.004718 [GRADIENT NORM TOTAL] 4.9785 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.056 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50704795 0.49295205] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.079 [MASKS] A(Pass/Fail): 718/1330 | B: 640/1408 | C: 574/1474 [LOSS Ex1] A: 0.63030 | B: 0.61587 | C: 0.61323 [LOGITS Ex2 A] Mean Abs: 2.115 | Max: 8.705 [LOSS Ex2] A: 0.12026 | B: 0.32734 | C: 0.23074 ** [JOINT LOSS] ** : 0.845911 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004393 | Grad Max: 0.171797 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.457066 | Grad Max: 2.096837 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006440 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003439 | Grad Max: 0.003439 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002843 | Grad Max: 0.435505 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052842 | Grad Max: 2.398676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.013678 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026583 | Grad Max: 0.182173 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000397 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005220 | Grad Max: 0.010378 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001341 | Grad Max: 0.003580 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000794 | Grad Max: 0.002093 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021861 | Grad Max: 0.021861 [GRADIENT NORM TOTAL] 9.7300 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.873 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012327 0.49876732] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 689/1359 | B: 606/1250 | C: 560/1488 [LOSS Ex1] A: 0.63905 | B: 0.62012 | C: 0.61501 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.549 [LOSS Ex2] A: 0.10718 | B: 0.31615 | C: 0.21506 ** [JOINT LOSS] ** : 0.837522 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004129 | Grad Max: 0.133722 -> Layer: shared_layers.0.bias | Grad Mean: 0.427546 | Grad Max: 1.781581 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005675 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005884 | Grad Max: 0.005884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002811 | Grad Max: 0.431505 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052323 | Grad Max: 2.447614 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000332 | Grad Max: 0.011056 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027558 | Grad Max: 0.151617 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000472 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005504 | Grad Max: 0.012200 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000280 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001359 | Grad Max: 0.004077 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000787 | Grad Max: 0.002398 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020542 | Grad Max: 0.020542 [GRADIENT NORM TOTAL] 9.3527 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.129 | Max: 0.787 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54398817 0.45601183] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.075 [MASKS] A(Pass/Fail): 686/1362 | B: 651/1397 | C: 582/1466 [LOSS Ex1] A: 0.63944 | B: 0.61959 | C: 0.61456 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.174 [LOSS Ex2] A: 0.11727 | B: 0.33097 | C: 0.27193 ** [JOINT LOSS] ** : 0.864590 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003756 | Grad Max: 0.132255 -> Layer: shared_layers.0.bias | Grad Mean: 0.269843 | Grad Max: 1.464830 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005306 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001652 | Grad Max: 0.001652 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001926 | Grad Max: 0.494787 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035569 | Grad Max: 2.762136 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.007671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016481 | Grad Max: 0.088233 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000321 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003187 | Grad Max: 0.007875 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000757 | Grad 
Max: 0.002821 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000377 | Grad Max: 0.001115 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010524 | Grad Max: 0.010524 [GRADIENT NORM TOTAL] 7.1540 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.982 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7884864 0.2115136] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 751/1297 | B: 639/1409 | C: 620/1428 [LOSS Ex1] A: 0.63308 | B: 0.62015 | C: 0.60858 [LOGITS Ex2 A] Mean Abs: 2.191 | Max: 6.591 [LOSS Ex2] A: 0.11063 | B: 0.33823 | C: 0.23122 ** [JOINT LOSS] ** : 0.847299 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005195 | Grad Max: 0.165205 -> Layer: shared_layers.0.bias | Grad Mean: 0.445922 | Grad Max: 2.195778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.006435 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001914 | Grad Max: 0.001914 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002876 | Grad Max: 0.657604 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053177 | Grad Max: 3.663275 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000314 | Grad Max: 0.012129 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026110 | Grad Max: 0.165184 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000439 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005179 | Grad Max: 0.011238 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000244 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001294 | Grad Max: 0.003468 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000702 | Grad Max: 0.002055 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019176 | Grad Max: 0.019176 [GRADIENT NORM TOTAL] 10.3242 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 1.089 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5003996 0.4996004] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 721/1327 | B: 640/1408 | C: 562/1486 [LOSS Ex1] A: 0.64008 | B: 0.61573 | C: 0.61516 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 6.313 [LOSS Ex2] A: 0.09786 | B: 0.31362 | C: 0.22831 ** [JOINT LOSS] ** : 0.836919 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002645 | Grad Max: 0.064875 -> Layer: shared_layers.0.bias | Grad Mean: 0.121638 | Grad Max: 0.481712 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005552 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002551 | Grad Max: 0.002551 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000911 | Grad Max: 0.427436 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015601 | Grad Max: 2.353610 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003228 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002480 | Grad Max: 0.022084 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000553 | Grad Max: 0.002880 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000150 | Grad Max: 0.000671 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000422 | Grad Max: 0.001157 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002073 | Grad Max: 0.002073 [GRADIENT NORM TOTAL] 4.5116 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.769 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72273517 0.2772648 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 714/1334 | B: 606/1250 | C: 566/1482 [LOSS Ex1] A: 0.63551 | B: 0.61997 | C: 0.61616 [LOGITS Ex2 A] Mean Abs: 2.124 | Max: 6.159 [LOSS Ex2] A: 0.13097 | B: 0.32159 | C: 0.23019 ** [JOINT LOSS] ** : 0.851467 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.006062 | Grad Max: 0.160743 -> Layer: shared_layers.0.bias | Grad Mean: 0.368605 | Grad Max: 1.865063 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.006031 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003941 | Grad Max: 0.003941 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002291 | Grad Max: 0.239275 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041996 | Grad Max: 1.323700 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.011356 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024750 | Grad Max: 0.136553 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004995 | Grad Max: 0.010863 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001241 | Grad Max: 0.003900 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000801 | Grad Max: 0.002331 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019945 | Grad Max: 0.019945 [GRADIENT NORM TOTAL] 7.1375 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.889 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6332908 0.36670914] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 590/1026 | B: 651/1397 | C: 580/1468 [LOSS Ex1] A: 0.63370 | B: 0.61945 | C: 0.61182 [LOGITS Ex2 A] Mean Abs: 2.212 | Max: 7.305 [LOSS Ex2] A: 0.10935 | B: 0.32683 | C: 0.21829 ** [JOINT LOSS] ** : 0.839812 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002474 | Grad Max: 0.061440 -> Layer: shared_layers.0.bias | Grad Mean: 0.089233 | Grad Max: 0.463665 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.006390 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005964 | Grad Max: 0.005964 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000624 | Grad Max: 0.182248 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.010429 | Grad Max: 0.994864 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.003369 -> Layer: exit2_layers.3.bias | Grad Mean: 0.001933 | Grad Max: 0.016696 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000120 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000339 | Grad Max: 0.002066 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000084 | Grad Max: 0.000571 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001303 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001083 | Grad Max: 0.001083 [GRADIENT NORM TOTAL] 2.3812 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.091 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5077049 0.49229512] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 720/1328 | B: 640/1408 | C: 590/1458 [LOSS Ex1] A: 0.63438 | B: 0.62000 | C: 0.61237 [LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.668 [LOSS Ex2] A: 0.10443 | B: 0.32732 | C: 0.22212 ** [JOINT LOSS] ** : 0.840210 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004381 | Grad Max: 0.122634 -> Layer: shared_layers.0.bias | Grad Mean: 0.365574 | Grad Max: 1.700921 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005641 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003466 | Grad Max: 0.003466 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002459 | Grad Max: 0.437562 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044905 | Grad Max: 2.446554 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000263 | Grad Max: 0.011666 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021853 | Grad Max: 0.144821 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000423 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004408 | Grad Max: 0.011290 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001093 | Grad Max: 0.003242 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001870 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015896 | Grad Max: 0.015896 [GRADIENT NORM TOTAL] 8.7082 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.029 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50845283 0.49154717] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 716/1332 | B: 641/1407 | C: 570/1478 [LOSS Ex1] A: 0.63110 | B: 0.61557 | C: 0.61395 [LOGITS Ex2 A] Mean Abs: 2.193 | Max: 6.420 [LOSS Ex2] A: 0.11226 | B: 0.30101 | C: 0.22535 ** [JOINT LOSS] ** : 0.833080 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003754 | Grad Max: 0.110798 -> Layer: shared_layers.0.bias | Grad Mean: 0.215554 | Grad Max: 1.182494 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002274 | Grad Max: 0.006043 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002331 | Grad Max: 0.002331 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001584 | Grad Max: 0.499442 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028659 | Grad Max: 2.769616 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.005523 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011895 | Grad Max: 0.065254 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002465 | Grad Max: 0.005671 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000130 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000618 | Grad Max: 0.001801 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000431 | Grad Max: 0.001585 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009679 | Grad Max: 0.009679 [GRADIENT NORM TOTAL] 6.1557 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.061 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50718826 0.49281177] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 718/1330 | B: 606/1250 | C: 581/1467 [LOSS Ex1] A: 0.63008 | B: 0.61981 | C: 0.61185 [LOGITS Ex2 A] Mean Abs: 2.143 | Max: 7.256 [LOSS Ex2] A: 0.12533 | B: 0.31602 | C: 0.17732 ** [JOINT LOSS] ** : 0.826804 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003775 | Grad Max: 0.159701 -> Layer: shared_layers.0.bias | Grad Mean: 0.394557 | Grad Max: 2.373050 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.006587 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003249 | Grad Max: 0.003249 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002558 | Grad Max: 0.561198 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046847 | Grad Max: 3.126805 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000249 | Grad Max: 0.010957 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020629 | Grad Max: 0.117225 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000322 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003920 | Grad Max: 0.009615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000224 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000935 | Grad Max: 0.003308 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000532 | Grad Max: 0.002233 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013608 | Grad Max: 0.013608 [GRADIENT NORM TOTAL] 9.5828 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.877 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012676 0.49873236] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 689/1359 | B: 651/1397 | C: 562/1486 [LOSS Ex1] A: 0.63885 | B: 0.61928 | C: 0.61558 [LOGITS Ex2 A] Mean Abs: 2.113 | Max: 
6.116 [LOSS Ex2] A: 0.10498 | B: 0.34055 | C: 0.23299 ** [JOINT LOSS] ** : 0.850743 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005101 | Grad Max: 0.178039 -> Layer: shared_layers.0.bias | Grad Mean: 0.489628 | Grad Max: 2.362270 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005671 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006572 | Grad Max: 0.006572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003139 | Grad Max: 0.474675 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058124 | Grad Max: 2.657853 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000343 | Grad Max: 0.011229 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028696 | Grad Max: 0.145429 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000476 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005805 | Grad Max: 0.011949 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001474 | Grad Max: 0.004148 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000956 | Grad Max: 0.002410 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024350 | Grad Max: 0.024350 [GRADIENT NORM TOTAL] 10.6460 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.791 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5440161 0.45598385] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 686/1362 | B: 640/1408 | C: 355/1021 [LOSS Ex1] A: 0.63925 | B: 0.61984 | C: 0.61567 [LOGITS Ex2 A] Mean Abs: 2.143 | Max: 5.402 [LOSS Ex2] A: 0.11190 | B: 0.32342 | C: 0.22403 ** [JOINT LOSS] ** : 0.844706 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003082 | Grad Max: 0.077064 -> Layer: shared_layers.0.bias | Grad Mean: 0.157031 | Grad Max: 0.754877 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005667 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008876 | 
Grad Max: 0.008876 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001241 | Grad Max: 0.262237 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022122 | Grad Max: 1.447779 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000093 | Grad Max: 0.004252 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007418 | Grad Max: 0.041040 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000186 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001442 | Grad Max: 0.004626 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000351 | Grad Max: 0.001220 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000438 | Grad Max: 0.001291 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004673 | Grad Max: 0.004673 [GRADIENT NORM TOTAL] 4.2061 [EPOCH SUMMARY] Train Loss: 0.8438 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8251 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8277 -> New: 0.8251) ############################## EPOCH 146/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.987 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.78961784 0.21038209] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.080 [MASKS] A(Pass/Fail): 751/1297 | B: 643/1405 | C: 590/1458 [LOSS Ex1] A: 0.63287 | B: 0.61541 | C: 0.61335 [LOGITS Ex2 A] Mean Abs: 2.194 | Max: 7.066 [LOSS Ex2] A: 0.10649 | B: 0.30606 | C: 0.24467 ** [JOINT LOSS] ** : 0.839617 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004787 | Grad Max: 0.145319 -> Layer: shared_layers.0.bias | Grad Mean: 0.331769 | Grad Max: 1.878395 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.005696 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004097 | Grad Max: 0.004097 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.522220 
-> Layer: exit2_layers.0.bias | Grad Mean: 0.037699 | Grad Max: 2.915953 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000226 | Grad Max: 0.010202 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018613 | Grad Max: 0.111564 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000317 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003705 | Grad Max: 0.008198 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000175 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.002544 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000484 | Grad Max: 0.001603 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013384 | Grad Max: 0.013384 [GRADIENT NORM TOTAL] 7.8773 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003084 0.4996916] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 721/1327 | B: 607/1249 | C: 553/1495 [LOSS Ex1] A: 0.63988 | B: 0.61966 | C: 0.61669 [LOGITS Ex2 A] Mean Abs: 2.180 | Max: 6.000 [LOSS Ex2] A: 0.09448 | B: 0.30786 | C: 0.22947 ** [JOINT LOSS] ** : 0.836010 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002714 | Grad Max: 0.073461 -> Layer: shared_layers.0.bias | Grad Mean: 0.175259 | Grad Max: 0.731726 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005985 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000024 | Grad Max: 0.000024 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001203 | Grad Max: 0.178721 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021450 | Grad Max: 0.989484 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.004633 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009258 | Grad Max: 0.055175 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000232 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001844 | Grad Max: 0.005013 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000439 | Grad Max: 0.001383 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001549 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007204 | Grad Max: 0.007204 [GRADIENT NORM TOTAL] 3.8323 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.773 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7236364 0.27636364] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 714/1334 | B: 652/1396 | C: 565/1483 [LOSS Ex1] A: 0.63529 | B: 0.61914 | C: 0.61406 [LOGITS Ex2 A] Mean Abs: 2.158 | Max: 7.239 [LOSS Ex2] A: 0.12171 | B: 0.32359 | C: 0.23274 ** [JOINT LOSS] ** : 0.848840 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002395 | Grad Max: 0.047443 -> Layer: shared_layers.0.bias | Grad Mean: 0.128585 | Grad Max: 0.548426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.005671 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003302 | Grad Max: 0.003302 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001022 | Grad Max: 0.479250 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018177 | Grad Max: 2.684140 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.004992 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006258 | Grad Max: 0.051340 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000196 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001209 | Grad Max: 0.004001 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000302 | Grad Max: 0.001250 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000300 | Grad Max: 0.001414 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005084 | Grad Max: 0.005084 [GRADIENT NORM TOTAL] 4.3335 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) 
| Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.893 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63357925 0.36642075] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 590/1026 | B: 641/1407 | C: 551/1497 [LOSS Ex1] A: 0.63345 | B: 0.61969 | C: 0.61513 [LOGITS Ex2 A] Mean Abs: 2.249 | Max: 7.939 [LOSS Ex2] A: 0.10721 | B: 0.32874 | C: 0.25965 ** [JOINT LOSS] ** : 0.854623 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003676 | Grad Max: 0.187723 -> Layer: shared_layers.0.bias | Grad Mean: 0.472153 | Grad Max: 2.695975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005693 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006200 | Grad Max: 0.006200 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002907 | Grad Max: 0.477784 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053756 | Grad Max: 2.631457 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.011997 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026338 | Grad Max: 0.154328 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000389 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005030 | Grad Max: 0.011208 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000239 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001196 | Grad Max: 0.003810 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000585 | Grad Max: 0.001627 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016501 | Grad Max: 0.016501 [GRADIENT NORM TOTAL] 10.5109 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.098 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50780094 0.49219906] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 721/1327 | B: 643/1405 | C: 540/1508 [LOSS Ex1] A: 0.63416 | B: 0.61526 | C: 0.61997 [LOGITS Ex2 A] Mean Abs: 2.222 | Max: 9.099 
[LOSS Ex2] A: 0.10376 | B: 0.31114 | C: 0.22388 ** [JOINT LOSS] ** : 0.836057 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002791 | Grad Max: 0.118336 -> Layer: shared_layers.0.bias | Grad Mean: 0.257826 | Grad Max: 1.479319 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002151 | Grad Max: 0.005807 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007274 | Grad Max: 0.007274 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001723 | Grad Max: 0.381173 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031463 | Grad Max: 2.123986 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000163 | Grad Max: 0.006116 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013549 | Grad Max: 0.076324 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002740 | Grad Max: 0.006447 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.002070 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001637 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010411 | Grad Max: 0.010411 [GRADIENT NORM TOTAL] 6.1728 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.034 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50831866 0.49168137] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 716/1332 | B: 607/1249 | C: 554/1494 [LOSS Ex1] A: 0.63087 | B: 0.61951 | C: 0.61824 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.859 [LOSS Ex2] A: 0.11204 | B: 0.30859 | C: 0.21088 ** [JOINT LOSS] ** : 0.833374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003512 | Grad Max: 0.115865 -> Layer: shared_layers.0.bias | Grad Mean: 0.329946 | Grad Max: 1.636300 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002306 | Grad Max: 0.006727 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010923 | Grad Max: 
0.010923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002027 | Grad Max: 0.627280 -> Layer: exit2_layers.0.bias | Grad Mean: 0.037904 | Grad Max: 3.488873 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000211 | Grad Max: 0.007900 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017570 | Grad Max: 0.107716 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000331 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003424 | Grad Max: 0.007552 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000173 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000825 | Grad Max: 0.002444 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000494 | Grad Max: 0.001777 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012387 | Grad Max: 0.012387 [GRADIENT NORM TOTAL] 8.2135 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.066 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073336 0.49266642] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 718/1330 | B: 652/1396 | C: 544/1504 [LOSS Ex1] A: 0.62985 | B: 0.61900 | C: 0.61705 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.344 [LOSS Ex2] A: 0.12762 | B: 0.34256 | C: 0.23145 ** [JOINT LOSS] ** : 0.855839 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004829 | Grad Max: 0.193693 -> Layer: shared_layers.0.bias | Grad Mean: 0.450795 | Grad Max: 2.567899 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.005881 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000923 | Grad Max: 0.000923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002895 | Grad Max: 0.706003 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052685 | Grad Max: 3.922365 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000304 | Grad Max: 0.010742 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024943 | Grad Max: 0.135207 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000404 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004599 | Grad Max: 0.009161 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000227 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001175 | Grad Max: 0.003336 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000724 | Grad Max: 0.001894 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019636 | Grad Max: 0.019636 [GRADIENT NORM TOTAL] 10.8162 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.882 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50129914 0.49870086] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.079 [MASKS] A(Pass/Fail): 688/1360 | B: 641/1407 | C: 583/1465 [LOSS Ex1] A: 0.63863 | B: 0.61955 | C: 0.60907 [LOGITS Ex2 A] Mean Abs: 2.168 | Max: 5.791 [LOSS Ex2] A: 0.10348 | B: 0.32288 | C: 0.21869 ** [JOINT LOSS] ** : 0.837433 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002439 | Grad Max: 0.054570 -> Layer: shared_layers.0.bias | Grad Mean: 0.137617 | Grad Max: 0.755732 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005877 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002217 | Grad Max: 0.002217 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000850 | Grad Max: 0.477638 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014942 | Grad Max: 2.662036 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000044 | Grad Max: 0.003124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002334 | Grad Max: 0.024040 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000105 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000420 | Grad Max: 0.002161 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.000737 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001012 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001113 | Grad Max: 0.001113 [GRADIENT NORM 
TOTAL] 4.4797 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.794 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438778 0.4561222] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 686/1362 | B: 644/1404 | C: 556/1492 [LOSS Ex1] A: 0.63904 | B: 0.61512 | C: 0.61966 [LOGITS Ex2 A] Mean Abs: 2.145 | Max: 6.043 [LOSS Ex2] A: 0.11016 | B: 0.30988 | C: 0.21667 ** [JOINT LOSS] ** : 0.836843 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003313 | Grad Max: 0.127864 -> Layer: shared_layers.0.bias | Grad Mean: 0.330760 | Grad Max: 1.401525 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005510 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003366 | Grad Max: 0.003366 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.291886 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041719 | Grad Max: 1.629793 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000251 | Grad Max: 0.011097 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020920 | Grad Max: 0.124850 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000381 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004144 | Grad Max: 0.008831 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000225 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001039 | Grad Max: 0.003100 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000619 | Grad Max: 0.002012 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016660 | Grad Max: 0.016660 [GRADIENT NORM TOTAL] 7.3066 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.992 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7905287 0.20947124] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.080 [MASKS] A(Pass/Fail): 751/1297 | B: 607/1249 | C: 564/1484 [LOSS Ex1] A: 
0.63266 | B: 0.61937 | C: 0.61254 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.833 [LOSS Ex2] A: 0.10771 | B: 0.30665 | C: 0.21544 ** [JOINT LOSS] ** : 0.831460 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.068362 -> Layer: shared_layers.0.bias | Grad Mean: 0.098755 | Grad Max: 0.364832 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006555 | Grad Max: 0.006555 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000730 | Grad Max: 0.156309 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012825 | Grad Max: 0.862393 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.002993 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004342 | Grad Max: 0.033693 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000168 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000699 | Grad Max: 0.003516 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000163 | Grad Max: 0.001001 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000459 | Grad Max: 0.001213 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000754 | Grad Max: 0.000754 [GRADIENT NORM TOTAL] 2.2772 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50027436 0.4997257 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 721/1327 | B: 652/1396 | C: 608/1440 [LOSS Ex1] A: 0.63968 | B: 0.61885 | C: 0.61151 [LOGITS Ex2 A] Mean Abs: 2.219 | Max: 6.423 [LOSS Ex2] A: 0.10044 | B: 0.32406 | C: 0.22805 ** [JOINT LOSS] ** : 0.840866 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002295 | Grad Max: 0.063146 -> Layer: shared_layers.0.bias | Grad Mean: 0.102668 | Grad Max: 0.545969 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | 
Grad Max: 0.005623 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003361 | Grad Max: 0.003361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000696 | Grad Max: 0.244517 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011589 | Grad Max: 1.354017 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003529 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004023 | Grad Max: 0.038273 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000143 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000602 | Grad Max: 0.002989 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000118 | Grad Max: 0.000729 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000341 | Grad Max: 0.000992 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000346 | Grad Max: 0.000346 [GRADIENT NORM TOTAL] 2.5581 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.777 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72429097 0.27570903] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 714/1334 | B: 641/1407 | C: 581/1467 [LOSS Ex1] A: 0.63508 | B: 0.61939 | C: 0.61210 [LOGITS Ex2 A] Mean Abs: 2.183 | Max: 6.060 [LOSS Ex2] A: 0.11627 | B: 0.32520 | C: 0.23020 ** [JOINT LOSS] ** : 0.846076 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005200 | Grad Max: 0.138344 -> Layer: shared_layers.0.bias | Grad Mean: 0.222020 | Grad Max: 1.130686 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005769 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007912 | Grad Max: 0.007912 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001539 | Grad Max: 0.524224 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027586 | Grad Max: 2.948638 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.006236 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013324 | Grad Max: 
0.072691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000313 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002822 | Grad Max: 0.006321 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000172 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000713 | Grad Max: 0.002323 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000489 | Grad Max: 0.001836 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011319 | Grad Max: 0.011319 [GRADIENT NORM TOTAL] 5.6047 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.898 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63382417 0.36617583] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.080 [MASKS] A(Pass/Fail): 590/1026 | B: 644/1404 | C: 588/1460 [LOSS Ex1] A: 0.63324 | B: 0.61495 | C: 0.60861 [LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.890 [LOSS Ex2] A: 0.10701 | B: 0.30750 | C: 0.23123 ** [JOINT LOSS] ** : 0.834180 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.069802 -> Layer: shared_layers.0.bias | Grad Mean: 0.170480 | Grad Max: 0.898690 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002269 | Grad Max: 0.006232 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009363 | Grad Max: 0.009363 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001286 | Grad Max: 0.280713 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023143 | Grad Max: 1.543488 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.004776 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006902 | Grad Max: 0.054952 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001238 | Grad Max: 0.004158 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000289 | Grad Max: 0.001198 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000329 | Grad Max: 0.001095 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.003372 | Grad Max: 0.003372 [GRADIENT NORM TOTAL] 4.6450 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.103 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078022 0.49219784] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 721/1327 | B: 607/1249 | C: 404/972 [LOSS Ex1] A: 0.63394 | B: 0.61917 | C: 0.60063 [LOGITS Ex2 A] Mean Abs: 2.248 | Max: 8.094 [LOSS Ex2] A: 0.09871 | B: 0.31121 | C: 0.20130 ** [JOINT LOSS] ** : 0.821657 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.096777 -> Layer: shared_layers.0.bias | Grad Mean: 0.116640 | Grad Max: 0.541411 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.005896 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005454 | Grad Max: 0.005454 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000975 | Grad Max: 0.224079 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016921 | Grad Max: 1.247358 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.003011 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002298 | Grad Max: 0.032238 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000165 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000461 | Grad Max: 0.002775 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000065 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000134 | Grad Max: 0.000648 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000451 | Grad Max: 0.001071 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001682 | Grad Max: 0.001682 [GRADIENT NORM TOTAL] 3.4716 [EPOCH SUMMARY] Train Loss: 0.8395 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8225 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8251 -> New: 0.8225) ############################## EPOCH 147/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.039 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50817853 0.4918214 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 715/1333 | B: 652/1396 | C: 595/1453 [LOSS Ex1] A: 0.63065 | B: 0.61866 | C: 0.61326 [LOGITS Ex2 A] Mean Abs: 2.248 | Max: 7.588 [LOSS Ex2] A: 0.10879 | B: 0.31946 | C: 0.22706 ** [JOINT LOSS] ** : 0.839297 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003008 | Grad Max: 0.082961 -> Layer: shared_layers.0.bias | Grad Mean: 0.211559 | Grad Max: 1.042914 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006091 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000445 | Grad Max: 0.000445 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001008 | Grad Max: 0.542972 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017602 | Grad Max: 3.022865 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003619 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002984 | Grad Max: 0.030986 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000514 | Grad Max: 0.002747 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000144 | Grad Max: 0.000840 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.000969 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002942 | Grad Max: 0.002942 [GRADIENT NORM TOTAL] 5.5294 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.072 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5074944 0.4925056] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 717/1331 | B: 641/1407 | C: 566/1482 [LOSS Ex1] A: 0.62962 | B: 0.61920 | C: 0.61560 [LOGITS Ex2 A] Mean Abs: 2.228 | Max: 7.530 [LOSS Ex2] A: 0.12412 | B: 0.32496 | C: 0.22146 ** [JOINT LOSS] ** : 0.844985 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004102 | Grad Max: 0.115223 -> Layer: shared_layers.0.bias | Grad Mean: 0.163014 | Grad Max: 0.584520 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006487 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000693 | Grad Max: 0.000693 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000984 | Grad Max: 0.579787 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016503 | Grad Max: 3.211062 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.002719 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003655 | Grad Max: 0.026557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000191 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000764 | Grad Max: 0.003694 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000106 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000154 | Grad Max: 0.001056 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000297 | Grad Max: 0.000776 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000536 | Grad Max: 0.000536 [GRADIENT NORM TOTAL] 4.7957 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.886 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012984 0.49870154] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.559 | Std: 0.079 [MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 573/1475 [LOSS Ex1] A: 0.63840 | B: 0.61476 | C: 0.61112 [LOGITS Ex2 A] Mean Abs: 2.206 | Max: 5.896 [LOSS Ex2] A: 0.10996 | B: 0.30103 | C: 0.22501 ** [JOINT LOSS] ** : 0.833422 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003520 | Grad Max: 0.108756 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.124714 | Grad Max: 0.807372 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.005714 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006826 | Grad Max: 0.006826 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001101 | Grad Max: 0.439941 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018925 | Grad Max: 2.466285 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.003694 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007590 | Grad Max: 0.044428 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001606 | Grad Max: 0.004418 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000414 | Grad Max: 0.001219 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000361 | Grad Max: 0.001434 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007856 | Grad Max: 0.007856 [GRADIENT NORM TOTAL] 4.2404 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.799 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438232 0.45617682] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 687/1361 | B: 607/1249 | C: 575/1473 [LOSS Ex1] A: 0.63881 | B: 0.61897 | C: 0.61034 [LOGITS Ex2 A] Mean Abs: 2.201 | Max: 6.203 [LOSS Ex2] A: 0.11910 | B: 0.29827 | C: 0.22526 ** [JOINT LOSS] ** : 0.836915 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003164 | Grad Max: 0.084929 -> Layer: shared_layers.0.bias | Grad Mean: 0.206179 | Grad Max: 1.098241 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005923 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002767 | Grad Max: 0.002767 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001195 | Grad Max: 0.347669 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021323 | Grad Max: 1.940694 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 
| Grad Max: 0.003978 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007287 | Grad Max: 0.048631 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000173 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001344 | Grad Max: 0.004489 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000312 | Grad Max: 0.001023 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000334 | Grad Max: 0.000996 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003836 | Grad Max: 0.003836 [GRADIENT NORM TOTAL] 4.8106 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.997 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.79181033 0.20818964] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 751/1297 | B: 653/1395 | C: 532/1516 [LOSS Ex1] A: 0.63242 | B: 0.61848 | C: 0.61682 [LOGITS Ex2 A] Mean Abs: 2.256 | Max: 6.608 [LOSS Ex2] A: 0.09665 | B: 0.32122 | C: 0.21944 ** [JOINT LOSS] ** : 0.835009 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001867 | Grad Max: 0.084331 -> Layer: shared_layers.0.bias | Grad Mean: 0.152113 | Grad Max: 0.873201 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005674 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002758 | Grad Max: 0.002758 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000940 | Grad Max: 0.299713 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016782 | Grad Max: 1.661914 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.004188 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004404 | Grad Max: 0.030741 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000763 | Grad Max: 0.003813 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000185 | Grad Max: 
0.001019 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000359 | Grad Max: 0.001100 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002498 | Grad Max: 0.002498 [GRADIENT NORM TOTAL] 4.1009 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 1.107 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002474 0.49975258] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.079 [MASKS] A(Pass/Fail): 721/1327 | B: 641/1407 | C: 548/1500 [LOSS Ex1] A: 0.63944 | B: 0.61901 | C: 0.61744 [LOGITS Ex2 A] Mean Abs: 2.233 | Max: 6.472 [LOSS Ex2] A: 0.09710 | B: 0.32031 | C: 0.23584 ** [JOINT LOSS] ** : 0.843044 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.110028 -> Layer: shared_layers.0.bias | Grad Mean: 0.249514 | Grad Max: 0.976144 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.005491 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002605 | Grad Max: 0.002605 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001572 | Grad Max: 0.208912 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028477 | Grad Max: 1.173194 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.006285 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014073 | Grad Max: 0.071386 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000286 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002900 | Grad Max: 0.006935 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000726 | Grad Max: 0.002173 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001894 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012901 | Grad Max: 0.012901 [GRADIENT NORM TOTAL] 4.9015 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.782 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.7253224 0.27467754] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 714/1334 | B: 645/1403 | C: 583/1465 [LOSS Ex1] A: 0.63481 | B: 0.61456 | C: 0.60884 [LOGITS Ex2 A] Mean Abs: 2.217 | Max: 6.338 [LOSS Ex2] A: 0.12644 | B: 0.30286 | C: 0.22879 ** [JOINT LOSS] ** : 0.838769 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002464 | Grad Max: 0.059432 -> Layer: shared_layers.0.bias | Grad Mean: 0.170229 | Grad Max: 0.842809 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002238 | Grad Max: 0.005489 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003907 | Grad Max: 0.003907 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001307 | Grad Max: 0.412116 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023169 | Grad Max: 2.297638 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000097 | Grad Max: 0.004266 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007850 | Grad Max: 0.044676 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000206 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001533 | Grad Max: 0.004546 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000369 | Grad Max: 0.001371 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000292 | Grad Max: 0.001020 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005526 | Grad Max: 0.005526 [GRADIENT NORM TOTAL] 4.9415 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.903 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6342683 0.3657317] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 590/1026 | B: 607/1249 | C: 571/1477 [LOSS Ex1] A: 0.63297 | B: 0.61878 | C: 0.61518 [LOGITS Ex2 A] Mean Abs: 2.274 | Max: 8.314 [LOSS Ex2] A: 0.10363 | B: 0.30870 | C: 0.22585 ** [JOINT LOSS] ** : 0.835033 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.002119 | Grad Max: 0.064543 -> Layer: shared_layers.0.bias | Grad Mean: 0.080324 | Grad Max: 0.794805 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.006014 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008843 | Grad Max: 0.008843 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000724 | Grad Max: 0.141688 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012372 | Grad Max: 0.782374 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003090 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003212 | Grad Max: 0.033338 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000140 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000533 | Grad Max: 0.003041 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000151 | Grad Max: 0.000606 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000301 | Grad Max: 0.001075 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003356 | Grad Max: 0.003356 [GRADIENT NORM TOTAL] 2.5509 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.109 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50788724 0.4921128 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 721/1327 | B: 653/1395 | C: 600/1448 [LOSS Ex1] A: 0.63367 | B: 0.61829 | C: 0.60950 [LOGITS Ex2 A] Mean Abs: 2.251 | Max: 7.598 [LOSS Ex2] A: 0.09918 | B: 0.33106 | C: 0.22870 ** [JOINT LOSS] ** : 0.840129 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003310 | Grad Max: 0.120129 -> Layer: shared_layers.0.bias | Grad Mean: 0.184116 | Grad Max: 0.834594 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006020 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003056 | Grad Max: 0.003056 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001350 | Grad Max: 0.518991 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.023306 | Grad Max: 2.884586 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000106 | Grad Max: 0.005774 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008497 | Grad Max: 0.067952 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000248 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001852 | Grad Max: 0.005309 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000130 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000449 | Grad Max: 0.001411 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000274 | Grad Max: 0.001131 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006083 | Grad Max: 0.006083 [GRADIENT NORM TOTAL] 5.6004 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.046 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080892 0.49191082] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 715/1333 | B: 641/1407 | C: 563/1485 [LOSS Ex1] A: 0.63037 | B: 0.61882 | C: 0.61305 [LOGITS Ex2 A] Mean Abs: 2.254 | Max: 5.518 [LOSS Ex2] A: 0.11513 | B: 0.32133 | C: 0.21372 ** [JOINT LOSS] ** : 0.837469 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003676 | Grad Max: 0.116877 -> Layer: shared_layers.0.bias | Grad Mean: 0.141851 | Grad Max: 0.594346 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006254 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007061 | Grad Max: 0.007061 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001189 | Grad Max: 0.511066 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020742 | Grad Max: 2.852502 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000087 | Grad Max: 0.004290 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006337 | Grad Max: 0.048857 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001286 | Grad Max: 0.004143 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | 
Grad Max: 0.000134 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000293 | Grad Max: 0.001359 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000335 | Grad Max: 0.001023 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003120 | Grad Max: 0.003120 [GRADIENT NORM TOTAL] 5.0650 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.078 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.507666 0.49233398] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 718/1330 | B: 645/1403 | C: 559/1489 [LOSS Ex1] A: 0.62933 | B: 0.61436 | C: 0.61057 [LOGITS Ex2 A] Mean Abs: 2.208 | Max: 6.394 [LOSS Ex2] A: 0.12678 | B: 0.31369 | C: 0.21959 ** [JOINT LOSS] ** : 0.838108 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003245 | Grad Max: 0.091487 -> Layer: shared_layers.0.bias | Grad Mean: 0.252514 | Grad Max: 1.216639 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002315 | Grad Max: 0.005943 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002302 | Grad Max: 0.002302 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001729 | Grad Max: 0.271046 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031404 | Grad Max: 1.499895 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000183 | Grad Max: 0.006295 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014982 | Grad Max: 0.089770 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002858 | Grad Max: 0.007000 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000698 | Grad Max: 0.001865 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000450 | Grad Max: 0.001486 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010989 | Grad Max: 0.010989 [GRADIENT NORM TOTAL] 5.9057 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.892 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50133747 0.4986625 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 687/1361 | B: 607/1249 | C: 570/1478 [LOSS Ex1] A: 0.63812 | B: 0.61859 | C: 0.61263 [LOGITS Ex2 A] Mean Abs: 2.206 | Max: 6.694 [LOSS Ex2] A: 0.10659 | B: 0.30633 | C: 0.22652 ** [JOINT LOSS] ** : 0.836260 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002324 | Grad Max: 0.055613 -> Layer: shared_layers.0.bias | Grad Mean: 0.138130 | Grad Max: 0.574237 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005471 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001866 | Grad Max: 0.001866 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000977 | Grad Max: 0.208628 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017073 | Grad Max: 1.199041 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.003440 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006079 | Grad Max: 0.037982 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000188 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001293 | Grad Max: 0.004139 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000109 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000318 | Grad Max: 0.001171 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000353 | Grad Max: 0.001313 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005552 | Grad Max: 0.005552 [GRADIENT NORM TOTAL] 3.5304 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.130 | Max: 0.803 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5437758 0.4562242] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.558 | Std: 0.076 [MASKS] A(Pass/Fail): 687/1361 | B: 653/1395 | C: 585/1463 [LOSS Ex1] A: 0.63855 | B: 0.61810 | C: 0.61642 [LOGITS Ex2 A] Mean Abs: 2.248 | Max: 6.164 [LOSS Ex2] A: 0.11796 | B: 0.32797 | C: 
0.22603 ** [JOINT LOSS] ** : 0.848343 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004763 | Grad Max: 0.231262 -> Layer: shared_layers.0.bias | Grad Mean: 0.534115 | Grad Max: 2.877620 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005747 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005704 | Grad Max: 0.005704 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003347 | Grad Max: 0.693671 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062418 | Grad Max: 3.864939 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.012818 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030748 | Grad Max: 0.172012 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000537 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005833 | Grad Max: 0.012789 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000281 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001424 | Grad Max: 0.004067 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000807 | Grad Max: 0.001948 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021649 | Grad Max: 0.021649 [GRADIENT NORM TOTAL] 12.3832 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.003 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7931068 0.20689319] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 749/1299 | B: 641/1407 | C: 384/992 [LOSS Ex1] A: 0.63215 | B: 0.61862 | C: 0.61358 [LOGITS Ex2 A] Mean Abs: 2.283 | Max: 6.606 [LOSS Ex2] A: 0.10986 | B: 0.32373 | C: 0.23260 ** [JOINT LOSS] ** : 0.843516 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004327 | Grad Max: 0.243253 -> Layer: shared_layers.0.bias | Grad Mean: 0.516600 | Grad Max: 3.005739 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.006216 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001323 | Grad Max: 0.001323 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.003344 | Grad Max: 0.731513 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062033 | Grad Max: 4.084873 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.012544 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028981 | Grad Max: 0.156985 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000427 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005616 | Grad Max: 0.011615 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000305 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001385 | Grad Max: 0.004318 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000726 | Grad Max: 0.002055 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020609 | Grad Max: 0.020609 [GRADIENT NORM TOTAL] 12.5890 [EPOCH SUMMARY] Train Loss: 0.8393 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8219 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8225 -> New: 0.8219) ############################## EPOCH 148/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.114 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500153 0.49984697] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 721/1327 | B: 645/1403 | C: 579/1469 [LOSS Ex1] A: 0.63918 | B: 0.61417 | C: 0.61109 [LOGITS Ex2 A] Mean Abs: 2.257 | Max: 5.892 [LOSS Ex2] A: 0.10344 | B: 0.30445 | C: 0.24359 ** [JOINT LOSS] ** : 0.838642 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004052 | Grad Max: 0.168747 -> Layer: shared_layers.0.bias | Grad Mean: 0.174264 | Grad Max: 1.119914 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005304 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000617 | Grad Max: 0.000617 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001251 | Grad Max: 0.386976 -> Layer: exit2_layers.0.bias 
| Grad Mean: 0.020970 | Grad Max: 2.153755 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.003843 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003787 | Grad Max: 0.036609 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000162 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000418 | Grad Max: 0.002568 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000061 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.000759 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.000907 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000113 | Grad Max: 0.000113 [GRADIENT NORM TOTAL] 4.8246 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.786 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72618556 0.2738145 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 714/1334 | B: 607/1249 | C: 583/1465 [LOSS Ex1] A: 0.63454 | B: 0.61840 | C: 0.61031 [LOGITS Ex2 A] Mean Abs: 2.193 | Max: 7.155 [LOSS Ex2] A: 0.12730 | B: 0.32562 | C: 0.21426 ** [JOINT LOSS] ** : 0.843476 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008432 | Grad Max: 0.240733 -> Layer: shared_layers.0.bias | Grad Mean: 0.661299 | Grad Max: 3.080513 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006352 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000252 | Grad Max: 0.000252 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004323 | Grad Max: 0.640907 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079612 | Grad Max: 3.617027 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000513 | Grad Max: 0.016316 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043046 | Grad Max: 0.224571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000676 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008566 | Grad Max: 0.017744 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000026 | Grad Max: 0.000452 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002029 | Grad Max: 0.006424 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001206 | Grad Max: 0.002956 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030645 | Grad Max: 0.030645 [GRADIENT NORM TOTAL] 14.3054 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.909 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6346236 0.36537638] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 590/1026 | B: 653/1395 | C: 571/1477 [LOSS Ex1] A: 0.63269 | B: 0.61792 | C: 0.60867 [LOGITS Ex2 A] Mean Abs: 2.258 | Max: 7.502 [LOSS Ex2] A: 0.10831 | B: 0.34133 | C: 0.20809 ** [JOINT LOSS] ** : 0.839001 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005432 | Grad Max: 0.244929 -> Layer: shared_layers.0.bias | Grad Mean: 0.659577 | Grad Max: 3.512191 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002213 | Grad Max: 0.006131 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000255 | Grad Max: 0.000255 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004106 | Grad Max: 0.718885 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076436 | Grad Max: 4.047704 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000467 | Grad Max: 0.015796 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040053 | Grad Max: 0.216301 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000586 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007717 | Grad Max: 0.016154 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000377 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001849 | Grad Max: 0.005271 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001065 | Grad Max: 0.002593 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028235 | Grad Max: 0.028235 [GRADIENT NORM TOTAL] 14.8204 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | 
Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.116 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50794154 0.49205846] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 721/1327 | B: 642/1406 | C: 570/1478 [LOSS Ex1] A: 0.63342 | B: 0.61845 | C: 0.61243 [LOGITS Ex2 A] Mean Abs: 2.284 | Max: 7.032 [LOSS Ex2] A: 0.09927 | B: 0.31766 | C: 0.21244 ** [JOINT LOSS] ** : 0.831226 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003523 | Grad Max: 0.124000 -> Layer: shared_layers.0.bias | Grad Mean: 0.163090 | Grad Max: 0.772394 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005494 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002989 | Grad Max: 0.002989 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001189 | Grad Max: 0.446076 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021251 | Grad Max: 2.445360 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.003807 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006220 | Grad Max: 0.047666 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000213 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001369 | Grad Max: 0.004696 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000098 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000352 | Grad Max: 0.001257 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000445 | Grad Max: 0.001457 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005259 | Grad Max: 0.005259 [GRADIENT NORM TOTAL] 5.0581 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.052 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50798213 0.49201784] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 715/1333 | B: 645/1403 | C: 610/1438 [LOSS Ex1] A: 0.63012 | B: 0.61400 | C: 0.60973 [LOGITS Ex2 A] Mean Abs: 2.290 | Max: 6.265 [LOSS Ex2] A: 0.11433 | B: 
0.30517 | C: 0.23490 ** [JOINT LOSS] ** : 0.836084 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008644 | Grad Max: 0.230924 -> Layer: shared_layers.0.bias | Grad Mean: 0.400923 | Grad Max: 1.441780 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002273 | Grad Max: 0.006117 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000957 | Grad Max: 0.000957 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002883 | Grad Max: 0.399275 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052360 | Grad Max: 2.217321 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010521 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027211 | Grad Max: 0.137563 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000471 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005610 | Grad Max: 0.011327 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000273 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001392 | Grad Max: 0.004090 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000799 | Grad Max: 0.002216 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021195 | Grad Max: 0.021195 [GRADIENT NORM TOTAL] 8.5383 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.085 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50777346 0.49222654] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 719/1329 | B: 607/1249 | C: 574/1474 [LOSS Ex1] A: 0.62909 | B: 0.61823 | C: 0.61421 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.909 [LOSS Ex2] A: 0.12558 | B: 0.31199 | C: 0.22313 ** [JOINT LOSS] ** : 0.840746 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006061 | Grad Max: 0.225433 -> Layer: shared_layers.0.bias | Grad Mean: 0.131007 | Grad Max: 0.786776 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.006399 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000697 | Grad Max: 0.000697 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.001250 | Grad Max: 0.253836 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020069 | Grad Max: 1.402760 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003144 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003400 | Grad Max: 0.023018 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000246 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000887 | Grad Max: 0.003468 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000120 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000237 | Grad Max: 0.001377 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000276 | Grad Max: 0.000997 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003732 | Grad Max: 0.003732 [GRADIENT NORM TOTAL] 3.7512 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.897 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012983 0.49870163] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 688/1360 | B: 653/1395 | C: 569/1479 [LOSS Ex1] A: 0.63790 | B: 0.61776 | C: 0.61018 [LOGITS Ex2 A] Mean Abs: 2.208 | Max: 6.763 [LOSS Ex2] A: 0.10350 | B: 0.33620 | C: 0.22778 ** [JOINT LOSS] ** : 0.844443 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006795 | Grad Max: 0.197257 -> Layer: shared_layers.0.bias | Grad Mean: 0.528215 | Grad Max: 2.278954 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.006200 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009680 | Grad Max: 0.009680 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003488 | Grad Max: 0.787651 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064619 | Grad Max: 4.374193 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.012362 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035340 | Grad Max: 0.166244 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000544 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.007025 | Grad Max: 0.013838 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001723 | Grad Max: 0.005168 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001063 | Grad Max: 0.002746 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027589 | Grad Max: 0.027589 [GRADIENT NORM TOTAL] 11.8342 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.808 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5438248 0.4561752] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 687/1361 | B: 641/1407 | C: 574/1474 [LOSS Ex1] A: 0.63835 | B: 0.61829 | C: 0.61224 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.424 [LOSS Ex2] A: 0.11898 | B: 0.32295 | C: 0.20887 ** [JOINT LOSS] ** : 0.839895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007880 | Grad Max: 0.289802 -> Layer: shared_layers.0.bias | Grad Mean: 0.250060 | Grad Max: 0.868961 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005675 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002466 | Grad Max: 0.002466 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001756 | Grad Max: 0.697389 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030453 | Grad Max: 3.874901 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000188 | Grad Max: 0.005799 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014597 | Grad Max: 0.073580 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000383 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003180 | Grad Max: 0.007879 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000763 | Grad Max: 0.001990 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000509 | Grad Max: 0.001784 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011839 | Grad Max: 0.011839 [GRADIENT NORM TOTAL] 
6.5490 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.008 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.79429924 0.20570076] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 753/1295 | B: 645/1403 | C: 547/1501 [LOSS Ex1] A: 0.63194 | B: 0.61384 | C: 0.61745 [LOGITS Ex2 A] Mean Abs: 2.252 | Max: 6.879 [LOSS Ex2] A: 0.10910 | B: 0.31047 | C: 0.24161 ** [JOINT LOSS] ** : 0.841467 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004807 | Grad Max: 0.273387 -> Layer: shared_layers.0.bias | Grad Mean: 0.567028 | Grad Max: 3.457232 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005595 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000017 | Grad Max: 0.000017 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003733 | Grad Max: 0.588589 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067983 | Grad Max: 3.251826 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.016065 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033337 | Grad Max: 0.199080 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000481 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006283 | Grad Max: 0.013507 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000287 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001542 | Grad Max: 0.004290 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.002069 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023389 | Grad Max: 0.023389 [GRADIENT NORM TOTAL] 13.3339 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.119 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50017214 0.49982783] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 721/1327 | B: 607/1249 | C: 549/1499 [LOSS Ex1] A: 
0.63899 | B: 0.61806 | C: 0.61892 [LOGITS Ex2 A] Mean Abs: 2.280 | Max: 6.107 [LOSS Ex2] A: 0.09855 | B: 0.31590 | C: 0.23217 ** [JOINT LOSS] ** : 0.840865 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006169 | Grad Max: 0.239992 -> Layer: shared_layers.0.bias | Grad Mean: 0.522170 | Grad Max: 3.158928 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002016 | Grad Max: 0.005525 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000425 | Grad Max: 0.000425 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003575 | Grad Max: 0.639206 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063727 | Grad Max: 3.510342 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.013597 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031040 | Grad Max: 0.177144 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000410 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005592 | Grad Max: 0.011563 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001370 | Grad Max: 0.003988 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000669 | Grad Max: 0.001893 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020651 | Grad Max: 0.020651 [GRADIENT NORM TOTAL] 12.2430 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.791 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.72711104 0.27288896] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.081 [MASKS] A(Pass/Fail): 714/1334 | B: 653/1395 | C: 585/1463 [LOSS Ex1] A: 0.63434 | B: 0.61761 | C: 0.61077 [LOGITS Ex2 A] Mean Abs: 2.238 | Max: 6.788 [LOSS Ex2] A: 0.12692 | B: 0.32075 | C: 0.20830 ** [JOINT LOSS] ** : 0.839566 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006425 | Grad Max: 0.277786 -> Layer: shared_layers.0.bias | Grad Mean: 0.096087 | Grad Max: 0.615917 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | 
Grad Max: 0.006055 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000733 | Grad Max: 0.000733 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001038 | Grad Max: 0.130181 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015506 | Grad Max: 0.610116 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.003876 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004036 | Grad Max: 0.029042 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000980 | Grad Max: 0.003995 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000238 | Grad Max: 0.001005 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000304 | Grad Max: 0.001109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003603 | Grad Max: 0.003603 [GRADIENT NORM TOTAL] 2.6854 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.913 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6350478 0.3649522] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 590/1026 | B: 642/1406 | C: 565/1483 [LOSS Ex1] A: 0.63249 | B: 0.61814 | C: 0.61178 [LOGITS Ex2 A] Mean Abs: 2.249 | Max: 7.915 [LOSS Ex2] A: 0.11599 | B: 0.32487 | C: 0.23232 ** [JOINT LOSS] ** : 0.845200 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003401 | Grad Max: 0.096774 -> Layer: shared_layers.0.bias | Grad Mean: 0.295513 | Grad Max: 1.116887 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006511 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011236 | Grad Max: 0.011236 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001744 | Grad Max: 0.389030 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032238 | Grad Max: 2.178115 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000189 | Grad Max: 0.009010 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015994 | Grad Max: 
0.103976 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000315 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003110 | Grad Max: 0.007067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000177 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000794 | Grad Max: 0.002178 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001657 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014497 | Grad Max: 0.014497 [GRADIENT NORM TOTAL] 6.1930 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.121 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50791514 0.49208483] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 721/1327 | B: 645/1403 | C: 559/1489 [LOSS Ex1] A: 0.63322 | B: 0.61370 | C: 0.61431 [LOGITS Ex2 A] Mean Abs: 2.266 | Max: 7.681 [LOSS Ex2] A: 0.11250 | B: 0.30226 | C: 0.24155 ** [JOINT LOSS] ** : 0.839176 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008813 | Grad Max: 0.400017 -> Layer: shared_layers.0.bias | Grad Mean: 0.201370 | Grad Max: 0.859841 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.005517 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004762 | Grad Max: 0.004762 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001738 | Grad Max: 0.394614 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028606 | Grad Max: 1.980252 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000159 | Grad Max: 0.005886 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011916 | Grad Max: 0.065343 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000404 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002940 | Grad Max: 0.006744 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000202 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000720 | Grad Max: 0.002004 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000486 | Grad Max: 0.001631 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.011036 | Grad Max: 0.011036 [GRADIENT NORM TOTAL] 5.4021 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.057 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5079038 0.4920962] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 715/1333 | B: 607/1249 | C: 396/980 [LOSS Ex1] A: 0.62991 | B: 0.61791 | C: 0.61372 [LOGITS Ex2 A] Mean Abs: 2.262 | Max: 6.255 [LOSS Ex2] A: 0.11775 | B: 0.30376 | C: 0.23262 ** [JOINT LOSS] ** : 0.838559 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010056 | Grad Max: 0.413016 -> Layer: shared_layers.0.bias | Grad Mean: 0.185530 | Grad Max: 0.893856 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.005900 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002463 | Grad Max: 0.002463 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001512 | Grad Max: 0.364667 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022756 | Grad Max: 1.785062 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.005387 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005737 | Grad Max: 0.039202 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000306 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001545 | Grad Max: 0.004837 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000344 | Grad Max: 0.001438 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000268 | Grad Max: 0.001313 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004194 | Grad Max: 0.004194 [GRADIENT NORM TOTAL] 4.6611 [EPOCH SUMMARY] Train Loss: 0.8399 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8201 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8219 -> New: 0.8201) ############################## EPOCH 149/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.090 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5078609 0.49213904] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 719/1329 | B: 653/1395 | C: 586/1462 [LOSS Ex1] A: 0.62888 | B: 0.61747 | C: 0.61554 [LOGITS Ex2 A] Mean Abs: 2.186 | Max: 6.160 [LOSS Ex2] A: 0.12498 | B: 0.32958 | C: 0.23491 ** [JOINT LOSS] ** : 0.850455 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007338 | Grad Max: 0.295747 -> Layer: shared_layers.0.bias | Grad Mean: 0.277528 | Grad Max: 1.423817 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006186 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003496 | Grad Max: 0.003496 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.283897 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035058 | Grad Max: 1.556970 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.008805 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014898 | Grad Max: 0.108191 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002392 | Grad Max: 0.005436 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000134 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000633 | Grad Max: 0.001885 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000345 | Grad Max: 0.001270 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011323 | Grad Max: 0.011323 [GRADIENT NORM TOTAL] 6.4258 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.901 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013038 0.49869618] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 687/1361 | B: 642/1406 | C: 557/1491 [LOSS Ex1] A: 0.63770 | B: 0.61799 | C: 0.61358 [LOGITS Ex2 A] Mean Abs: 2.206 | Max: 6.135 [LOSS Ex2] A: 0.10240 | B: 0.32253 | C: 0.21857 ** [JOINT LOSS] ** : 0.837590 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.095454 -> Layer: shared_layers.0.bias | Grad Mean: 0.148549 | Grad Max: 1.185788 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005733 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008016 | Grad Max: 0.008016 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000873 | Grad Max: 0.328486 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015312 | Grad Max: 1.831441 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002748 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002219 | Grad Max: 0.021813 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000374 | Grad Max: 0.002637 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000112 | Grad Max: 0.001085 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000406 | Grad Max: 0.001214 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000515 | Grad Max: 0.000515 [GRADIENT NORM TOTAL] 3.9925 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.811 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54374146 0.45625857] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 688/1360 | B: 645/1403 | C: 573/1475 [LOSS Ex1] A: 0.63815 | B: 0.61354 | C: 0.60960 [LOGITS Ex2 A] Mean Abs: 2.204 | Max: 6.308 [LOSS Ex2] A: 0.11379 | B: 0.29815 | C: 0.22330 ** [JOINT LOSS] ** : 0.832178 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005911 | Grad Max: 0.215428 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.338015 | Grad Max: 2.176780 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.005780 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007644 | Grad Max: 0.007644 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002026 | Grad Max: 0.471597 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034735 | Grad Max: 2.614603 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000151 | Grad Max: 0.006400 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011064 | Grad Max: 0.079390 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000250 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001712 | Grad Max: 0.004549 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000436 | Grad Max: 0.001277 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000369 | Grad Max: 0.001027 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006153 | Grad Max: 0.006153 [GRADIENT NORM TOTAL] 7.5870 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.013 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7952576 0.20474239] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 753/1295 | B: 607/1249 | C: 578/1470 [LOSS Ex1] A: 0.63172 | B: 0.61776 | C: 0.61453 [LOGITS Ex2 A] Mean Abs: 2.238 | Max: 6.937 [LOSS Ex2] A: 0.11064 | B: 0.30467 | C: 0.23354 ** [JOINT LOSS] ** : 0.837624 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006133 | Grad Max: 0.228967 -> Layer: shared_layers.0.bias | Grad Mean: 0.227007 | Grad Max: 1.429535 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006164 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008347 | Grad Max: 0.008347 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001544 | Grad Max: 0.398806 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025809 | Grad Max: 2.187736 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000103 | Grad Max: 0.005565 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005762 | Grad Max: 0.066694 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000632 | Grad Max: 0.003767 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000068 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000177 | Grad Max: 0.000958 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000318 | Grad Max: 0.000816 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002180 | Grad Max: 0.002180 [GRADIENT NORM TOTAL] 5.2652 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 1.125 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50007623 0.4999238 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 721/1327 | B: 654/1394 | C: 574/1474 [LOSS Ex1] A: 0.63878 | B: 0.61732 | C: 0.61267 [LOGITS Ex2 A] Mean Abs: 2.215 | Max: 6.600 [LOSS Ex2] A: 0.11256 | B: 0.33534 | C: 0.21128 ** [JOINT LOSS] ** : 0.842648 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010415 | Grad Max: 0.375245 -> Layer: shared_layers.0.bias | Grad Mean: 0.440335 | Grad Max: 2.069128 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.005830 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001654 | Grad Max: 0.001654 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002923 | Grad Max: 0.731435 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051470 | Grad Max: 4.074980 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000294 | Grad Max: 0.008467 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023786 | Grad Max: 0.113171 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000573 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005263 | Grad Max: 0.011656 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000302 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001253 | Grad 
Max: 0.003795 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000821 | Grad Max: 0.002311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019501 | Grad Max: 0.019501 [GRADIENT NORM TOTAL] 10.2976 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.794 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.727703 0.27229697] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.081 [MASKS] A(Pass/Fail): 714/1334 | B: 644/1404 | C: 593/1455 [LOSS Ex1] A: 0.63411 | B: 0.61784 | C: 0.60555 [LOGITS Ex2 A] Mean Abs: 2.220 | Max: 6.718 [LOSS Ex2] A: 0.12190 | B: 0.32468 | C: 0.20890 ** [JOINT LOSS] ** : 0.837659 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007396 | Grad Max: 0.240756 -> Layer: shared_layers.0.bias | Grad Mean: 0.366453 | Grad Max: 1.773302 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006574 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011153 | Grad Max: 0.011153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.638623 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035634 | Grad Max: 3.567219 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000192 | Grad Max: 0.006111 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015368 | Grad Max: 0.071504 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003356 | Grad Max: 0.008302 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000176 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000803 | Grad Max: 0.002415 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000570 | Grad Max: 0.001915 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013291 | Grad Max: 0.013291 [GRADIENT NORM TOTAL] 8.5848 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.918 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.6353262 0.3646738] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.081 [MASKS] A(Pass/Fail): 590/1026 | B: 646/1402 | C: 577/1471 [LOSS Ex1] A: 0.63226 | B: 0.61339 | C: 0.61548 [LOGITS Ex2 A] Mean Abs: 2.290 | Max: 7.722 [LOSS Ex2] A: 0.10801 | B: 0.30151 | C: 0.23413 ** [JOINT LOSS] ** : 0.834929 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003013 | Grad Max: 0.105536 -> Layer: shared_layers.0.bias | Grad Mean: 0.293278 | Grad Max: 1.318778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005836 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005311 | Grad Max: 0.005311 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.485572 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041330 | Grad Max: 2.703089 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000218 | Grad Max: 0.007507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018481 | Grad Max: 0.096666 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000323 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003430 | Grad Max: 0.007728 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000179 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000838 | Grad Max: 0.002708 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.001610 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011734 | Grad Max: 0.011734 [GRADIENT NORM TOTAL] 7.8017 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.127 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080019 0.49199808] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 721/1327 | B: 608/1248 | C: 559/1489 [LOSS Ex1] A: 0.63299 | B: 0.61761 | C: 0.61260 [LOGITS Ex2 A] Mean Abs: 2.282 | Max: 7.408 [LOSS Ex2] A: 0.09860 | B: 0.30142 | C: 0.22150 ** [JOINT LOSS] ** : 0.828243 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.005405 | Grad Max: 0.211060 -> Layer: shared_layers.0.bias | Grad Mean: 0.302336 | Grad Max: 1.280491 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002244 | Grad Max: 0.005827 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007650 | Grad Max: 0.007650 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002280 | Grad Max: 0.278072 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041069 | Grad Max: 1.517658 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.008995 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020167 | Grad Max: 0.109075 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000316 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004226 | Grad Max: 0.008694 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001106 | Grad Max: 0.002969 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000601 | Grad Max: 0.002133 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017214 | Grad Max: 0.017214 [GRADIENT NORM TOTAL] 6.9460 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.062 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50780654 0.4921934 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 715/1333 | B: 654/1394 | C: 595/1453 [LOSS Ex1] A: 0.62968 | B: 0.61718 | C: 0.61210 [LOGITS Ex2 A] Mean Abs: 2.246 | Max: 5.305 [LOSS Ex2] A: 0.10564 | B: 0.33649 | C: 0.23694 ** [JOINT LOSS] ** : 0.846012 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004191 | Grad Max: 0.133986 -> Layer: shared_layers.0.bias | Grad Mean: 0.347512 | Grad Max: 1.843104 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.006589 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005553 | Grad Max: 0.005553 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.260218 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.040158 | Grad Max: 1.446332 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000194 | Grad Max: 0.008099 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016475 | Grad Max: 0.097674 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000254 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002928 | Grad Max: 0.006956 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002249 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001155 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010537 | Grad Max: 0.010537 [GRADIENT NORM TOTAL] 7.6014 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080095 0.4919905] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 718/1330 | B: 644/1404 | C: 602/1446 [LOSS Ex1] A: 0.62865 | B: 0.61770 | C: 0.60844 [LOGITS Ex2 A] Mean Abs: 2.200 | Max: 7.684 [LOSS Ex2] A: 0.12119 | B: 0.32697 | C: 0.23517 ** [JOINT LOSS] ** : 0.846045 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004874 | Grad Max: 0.174399 -> Layer: shared_layers.0.bias | Grad Mean: 0.362096 | Grad Max: 1.856905 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002221 | Grad Max: 0.006139 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006520 | Grad Max: 0.006520 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002495 | Grad Max: 0.277167 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044080 | Grad Max: 1.527525 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.010258 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020795 | Grad Max: 0.144833 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000291 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003718 | Grad Max: 0.008629 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000936 | Grad Max: 0.002615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.001606 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015671 | Grad Max: 0.015671 [GRADIENT NORM TOTAL] 7.9223 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.906 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013242 0.49867585] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 687/1361 | B: 646/1402 | C: 549/1499 [LOSS Ex1] A: 0.63748 | B: 0.61325 | C: 0.61817 [LOGITS Ex2 A] Mean Abs: 2.204 | Max: 5.896 [LOSS Ex2] A: 0.09952 | B: 0.30116 | C: 0.24125 ** [JOINT LOSS] ** : 0.836944 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004379 | Grad Max: 0.105056 -> Layer: shared_layers.0.bias | Grad Mean: 0.269539 | Grad Max: 1.245909 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005232 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003668 | Grad Max: 0.003668 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001611 | Grad Max: 0.448073 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029059 | Grad Max: 2.505992 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.004579 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011617 | Grad Max: 0.055547 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000229 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002384 | Grad Max: 0.006414 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000592 | Grad Max: 0.002168 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000408 | Grad Max: 0.001480 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008993 | Grad Max: 0.008993 [GRADIENT NORM TOTAL] 6.4298 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.131 | Max: 0.815 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54365367 0.4563463 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 688/1360 | B: 608/1248 | C: 586/1462 [LOSS Ex1] A: 0.63794 | B: 0.61746 | C: 0.60824 [LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.236 [LOSS Ex2] A: 0.11612 | B: 0.29976 | C: 0.19290 ** [JOINT LOSS] ** : 0.824143 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003182 | Grad Max: 0.138788 -> Layer: shared_layers.0.bias | Grad Mean: 0.235455 | Grad Max: 1.526980 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.005806 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006235 | Grad Max: 0.006235 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001409 | Grad Max: 0.357189 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025390 | Grad Max: 1.988703 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000105 | Grad Max: 0.005967 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008717 | Grad Max: 0.056744 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000213 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001865 | Grad Max: 0.005304 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000114 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000508 | Grad Max: 0.001623 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000455 | Grad Max: 0.001480 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008352 | Grad Max: 0.008352 [GRADIENT NORM TOTAL] 5.6023 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.017 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7962062 0.20379382] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.082 [MASKS] A(Pass/Fail): 748/1300 | B: 655/1393 | C: 570/1478 [LOSS Ex1] A: 0.63152 | B: 0.61704 | C: 0.60909 [LOGITS Ex2 A] Mean Abs: 2.233 | Max: 
6.111 [LOSS Ex2] A: 0.10174 | B: 0.32398 | C: 0.21004 ** [JOINT LOSS] ** : 0.831135 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.056891 -> Layer: shared_layers.0.bias | Grad Mean: 0.125098 | Grad Max: 0.685857 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.006157 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002652 | Grad Max: 0.002652 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001045 | Grad Max: 0.148672 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018952 | Grad Max: 0.831788 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000080 | Grad Max: 0.004002 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006536 | Grad Max: 0.038221 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000178 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001283 | Grad Max: 0.004097 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000102 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000300 | Grad Max: 0.000999 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000364 | Grad Max: 0.001265 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005219 | Grad Max: 0.005219 [GRADIENT NORM TOTAL] 3.4499 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.130 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000562 0.49994382] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.080 [MASKS] A(Pass/Fail): 721/1327 | B: 644/1404 | C: 357/1019 [LOSS Ex1] A: 0.63859 | B: 0.61755 | C: 0.61275 [LOGITS Ex2 A] Mean Abs: 2.269 | Max: 6.778 [LOSS Ex2] A: 0.09475 | B: 0.32263 | C: 0.20967 ** [JOINT LOSS] ** : 0.831978 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004249 | Grad Max: 0.150482 -> Layer: shared_layers.0.bias | Grad Mean: 0.078697 | Grad Max: 0.600354 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002075 | Grad Max: 0.005434 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002130 | 
Grad Max: 0.002130 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000780 | Grad Max: 0.105018 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012371 | Grad Max: 0.585196 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002597 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002568 | Grad Max: 0.024560 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000152 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000361 | Grad Max: 0.002350 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000639 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000407 | Grad Max: 0.001334 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001492 | Grad Max: 0.001492 [GRADIENT NORM TOTAL] 2.2450 [EPOCH SUMMARY] Train Loss: 0.8370 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8174 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8201 -> New: 0.8174) ############################## EPOCH 150/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.798 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7283624 0.2716376] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.081 [MASKS] A(Pass/Fail): 714/1334 | B: 646/1402 | C: 572/1476 [LOSS Ex1] A: 0.63390 | B: 0.61309 | C: 0.61172 [LOGITS Ex2 A] Mean Abs: 2.237 | Max: 6.820 [LOSS Ex2] A: 0.12262 | B: 0.30470 | C: 0.22358 ** [JOINT LOSS] ** : 0.836537 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003021 | Grad Max: 0.062888 -> Layer: shared_layers.0.bias | Grad Mean: 0.152465 | Grad Max: 0.605028 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005929 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000624 | Grad Max: 0.000624 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000988 | Grad Max: 0.456241 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.016892 | Grad Max: 2.566595 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002698 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002638 | Grad Max: 0.025500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000125 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000430 | Grad Max: 0.003058 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000101 | Grad Max: 0.000632 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.000994 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000019 | Grad Max: 0.000019 [GRADIENT NORM TOTAL] 4.8113 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.922 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6355784 0.36442164] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 590/1026 | B: 608/1248 | C: 585/1463 [LOSS Ex1] A: 0.63205 | B: 0.61729 | C: 0.60588 [LOGITS Ex2 A] Mean Abs: 2.309 | Max: 8.461 [LOSS Ex2] A: 0.09905 | B: 0.29895 | C: 0.20574 ** [JOINT LOSS] ** : 0.819651 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002592 | Grad Max: 0.062312 -> Layer: shared_layers.0.bias | Grad Mean: 0.177754 | Grad Max: 0.697123 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.006090 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006580 | Grad Max: 0.006580 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001141 | Grad Max: 0.549817 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020275 | Grad Max: 3.059267 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.005483 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006416 | Grad Max: 0.057632 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001199 | Grad Max: 0.004209 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000270 | Grad Max: 0.001148 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.001257 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003562 | Grad Max: 0.003562 [GRADIENT NORM TOTAL] 5.1088 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.132 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080231 0.49197695] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 721/1327 | B: 655/1393 | C: 588/1460 [LOSS Ex1] A: 0.63278 | B: 0.61687 | C: 0.61222 [LOGITS Ex2 A] Mean Abs: 2.287 | Max: 8.585 [LOSS Ex2] A: 0.09971 | B: 0.31983 | C: 0.21800 ** [JOINT LOSS] ** : 0.833133 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003319 | Grad Max: 0.092843 -> Layer: shared_layers.0.bias | Grad Mean: 0.140289 | Grad Max: 0.827102 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005909 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003560 | Grad Max: 0.003560 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001002 | Grad Max: 0.253330 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017482 | Grad Max: 1.422133 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.004137 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008207 | Grad Max: 0.044274 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000207 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001716 | Grad Max: 0.004402 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000406 | Grad Max: 0.001252 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000358 | Grad Max: 0.001054 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005244 | Grad Max: 0.005244 [GRADIENT NORM TOTAL] 3.4733 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.067 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50764364 0.49235636] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 715/1333 | B: 644/1404 | C: 563/1485 [LOSS Ex1] A: 0.62947 | B: 0.61737 | C: 0.61344 [LOGITS Ex2 A] Mean Abs: 2.284 | Max: 6.556 [LOSS Ex2] A: 0.10988 | B: 0.31730 | C: 0.23122 ** [JOINT LOSS] ** : 0.839558 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003735 | Grad Max: 0.124624 -> Layer: shared_layers.0.bias | Grad Mean: 0.134374 | Grad Max: 0.592684 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006352 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004454 | Grad Max: 0.004454 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001107 | Grad Max: 0.193662 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019168 | Grad Max: 1.074257 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.005241 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007117 | Grad Max: 0.056039 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000144 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001337 | Grad Max: 0.003774 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000366 | Grad Max: 0.001407 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000288 | Grad Max: 0.001242 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007408 | Grad Max: 0.007408 [GRADIENT NORM TOTAL] 3.5693 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50814277 0.49185726] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 717/1331 | B: 646/1402 | C: 563/1485 [LOSS Ex1] A: 0.62843 | B: 0.61292 | C: 0.61443 [LOGITS Ex2 A] Mean Abs: 2.266 | Max: 6.942 
[LOSS Ex2] A: 0.12202 | B: 0.29275 | C: 0.21722 ** [JOINT LOSS] ** : 0.829255 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004910 | Grad Max: 0.161685 -> Layer: shared_layers.0.bias | Grad Mean: 0.156715 | Grad Max: 0.799183 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.006426 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003505 | Grad Max: 0.003505 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001152 | Grad Max: 0.279809 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019708 | Grad Max: 1.562342 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004920 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007434 | Grad Max: 0.047898 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000258 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001603 | Grad Max: 0.004934 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000128 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000376 | Grad Max: 0.001278 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000376 | Grad Max: 0.001361 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004733 | Grad Max: 0.004733 [GRADIENT NORM TOTAL] 3.8622 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.911 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5013079 0.49869213] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 687/1361 | B: 608/1248 | C: 586/1462 [LOSS Ex1] A: 0.63725 | B: 0.61709 | C: 0.60854 [LOGITS Ex2 A] Mean Abs: 2.220 | Max: 6.264 [LOSS Ex2] A: 0.10556 | B: 0.30868 | C: 0.23407 ** [JOINT LOSS] ** : 0.837063 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003104 | Grad Max: 0.095839 -> Layer: shared_layers.0.bias | Grad Mean: 0.256144 | Grad Max: 1.286984 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005713 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004798 | Grad Max: 
0.004798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001597 | Grad Max: 0.549105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028952 | Grad Max: 3.059704 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000160 | Grad Max: 0.005769 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013527 | Grad Max: 0.065727 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000297 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002706 | Grad Max: 0.006246 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000190 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000651 | Grad Max: 0.002439 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001473 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009918 | Grad Max: 0.009918 [GRADIENT NORM TOTAL] 6.8848 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.819 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54354715 0.45645288] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.077 [MASKS] A(Pass/Fail): 688/1360 | B: 655/1393 | C: 586/1462 [LOSS Ex1] A: 0.63772 | B: 0.61669 | C: 0.61281 [LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.296 [LOSS Ex2] A: 0.11460 | B: 0.31993 | C: 0.25148 ** [JOINT LOSS] ** : 0.851075 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005479 | Grad Max: 0.175898 -> Layer: shared_layers.0.bias | Grad Mean: 0.154084 | Grad Max: 0.826623 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005513 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008562 | Grad Max: 0.008562 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001312 | Grad Max: 0.470352 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022797 | Grad Max: 2.624824 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000117 | Grad Max: 0.003548 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009052 | Grad Max: 0.041698 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000264 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.002049 | Grad Max: 0.005141 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000127 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000551 | Grad Max: 0.001754 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001339 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010071 | Grad Max: 0.010071 [GRADIENT NORM TOTAL] 4.8038 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.022 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7973016 0.20269844] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.082 [MASKS] A(Pass/Fail): 748/1300 | B: 644/1404 | C: 591/1457 [LOSS Ex1] A: 0.63128 | B: 0.61719 | C: 0.61102 [LOGITS Ex2 A] Mean Abs: 2.314 | Max: 7.351 [LOSS Ex2] A: 0.09691 | B: 0.31611 | C: 0.22774 ** [JOINT LOSS] ** : 0.833413 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005985 | Grad Max: 0.233211 -> Layer: shared_layers.0.bias | Grad Mean: 0.614676 | Grad Max: 3.069510 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.006133 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001937 | Grad Max: 0.001937 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003906 | Grad Max: 0.656105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.072788 | Grad Max: 3.667587 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000439 | Grad Max: 0.016200 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038241 | Grad Max: 0.226423 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000549 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007407 | Grad Max: 0.014691 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000393 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001799 | Grad Max: 0.005117 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000944 | Grad Max: 0.002443 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026145 | Grad Max: 0.026145 [GRADIENT NORM 
TOTAL] 13.5825 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.136 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000295 0.49997053] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.081 [MASKS] A(Pass/Fail): 722/1326 | B: 646/1402 | C: 598/1450 [LOSS Ex1] A: 0.63835 | B: 0.61273 | C: 0.60777 [LOGITS Ex2 A] Mean Abs: 2.303 | Max: 6.290 [LOSS Ex2] A: 0.09401 | B: 0.30569 | C: 0.22571 ** [JOINT LOSS] ** : 0.828085 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003216 | Grad Max: 0.151026 -> Layer: shared_layers.0.bias | Grad Mean: 0.372298 | Grad Max: 2.001792 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005477 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005665 | Grad Max: 0.005665 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002544 | Grad Max: 0.476935 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046827 | Grad Max: 2.678825 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.010679 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020948 | Grad Max: 0.133889 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000338 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003918 | Grad Max: 0.009144 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000924 | Grad Max: 0.003122 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000441 | Grad Max: 0.001525 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012729 | Grad Max: 0.012729 [GRADIENT NORM TOTAL] 9.2387 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.802 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7290793 0.2709207] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 713/1335 | B: 608/1248 | C: 563/1485 [LOSS Ex1] A: 
0.63365 | B: 0.61691 | C: 0.61223 [LOGITS Ex2 A] Mean Abs: 2.245 | Max: 6.748 [LOSS Ex2] A: 0.12569 | B: 0.30188 | C: 0.21097 ** [JOINT LOSS] ** : 0.833780 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005138 | Grad Max: 0.171460 -> Layer: shared_layers.0.bias | Grad Mean: 0.207652 | Grad Max: 0.811173 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005922 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002197 | Grad Max: 0.002197 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001526 | Grad Max: 0.173822 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026818 | Grad Max: 0.877980 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000155 | Grad Max: 0.005259 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012647 | Grad Max: 0.071175 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000291 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002589 | Grad Max: 0.006558 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000176 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000593 | Grad Max: 0.002490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001660 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008600 | Grad Max: 0.008600 [GRADIENT NORM TOTAL] 4.4059 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.927 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63589865 0.36410135] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 590/1026 | B: 655/1393 | C: 589/1459 [LOSS Ex1] A: 0.63181 | B: 0.61652 | C: 0.60619 [LOGITS Ex2 A] Mean Abs: 2.300 | Max: 9.895 [LOSS Ex2] A: 0.10115 | B: 0.31733 | C: 0.19664 ** [JOINT LOSS] ** : 0.823216 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003780 | Grad Max: 0.100368 -> Layer: shared_layers.0.bias | Grad Mean: 0.248829 | Grad Max: 0.981002 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | 
Grad Max: 0.005803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005058 | Grad Max: 0.005058 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001480 | Grad Max: 0.230409 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026743 | Grad Max: 1.281951 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000128 | Grad Max: 0.004986 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010899 | Grad Max: 0.072478 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000220 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001989 | Grad Max: 0.005390 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000160 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000431 | Grad Max: 0.001796 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000393 | Grad Max: 0.001614 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006096 | Grad Max: 0.006096 [GRADIENT NORM TOTAL] 5.1870 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.138 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50800174 0.49199826] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 722/1326 | B: 644/1404 | C: 563/1485 [LOSS Ex1] A: 0.63253 | B: 0.61702 | C: 0.61314 [LOGITS Ex2 A] Mean Abs: 2.312 | Max: 8.742 [LOSS Ex2] A: 0.10277 | B: 0.31695 | C: 0.24608 ** [JOINT LOSS] ** : 0.842829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007037 | Grad Max: 0.189326 -> Layer: shared_layers.0.bias | Grad Mean: 0.455688 | Grad Max: 2.230294 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005687 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005237 | Grad Max: 0.005237 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003196 | Grad Max: 0.539846 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058896 | Grad Max: 3.035585 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.011699 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029193 | Grad Max: 
0.171446 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000530 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005812 | Grad Max: 0.012738 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000308 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001400 | Grad Max: 0.004380 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000741 | Grad Max: 0.002006 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019827 | Grad Max: 0.019827 [GRADIENT NORM TOTAL] 10.4140 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50755113 0.49244887] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 715/1333 | B: 646/1402 | C: 568/1480 [LOSS Ex1] A: 0.62923 | B: 0.61256 | C: 0.61511 [LOGITS Ex2 A] Mean Abs: 2.316 | Max: 6.760 [LOSS Ex2] A: 0.10934 | B: 0.30089 | C: 0.20418 ** [JOINT LOSS] ** : 0.823769 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007539 | Grad Max: 0.195031 -> Layer: shared_layers.0.bias | Grad Mean: 0.375303 | Grad Max: 1.520992 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.005989 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003333 | Grad Max: 0.003333 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002709 | Grad Max: 0.390236 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049560 | Grad Max: 2.118659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.008869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024408 | Grad Max: 0.130866 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000409 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005019 | Grad Max: 0.010423 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000251 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001243 | Grad Max: 0.003694 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000705 | Grad Max: 0.002193 
-> Layer: exit2_layers.12.bias | Grad Mean: 0.018946 | Grad Max: 0.018946 [GRADIENT NORM TOTAL] 8.4697 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.107 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082463 0.49175373] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 717/1331 | B: 608/1248 | C: 368/1008 [LOSS Ex1] A: 0.62819 | B: 0.61675 | C: 0.61723 [LOGITS Ex2 A] Mean Abs: 2.231 | Max: 6.957 [LOSS Ex2] A: 0.11802 | B: 0.30828 | C: 0.22250 ** [JOINT LOSS] ** : 0.836989 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004290 | Grad Max: 0.181603 -> Layer: shared_layers.0.bias | Grad Mean: 0.474487 | Grad Max: 2.337373 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002186 | Grad Max: 0.006307 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004165 | Grad Max: 0.004165 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002905 | Grad Max: 0.591017 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053546 | Grad Max: 3.244539 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.010252 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027756 | Grad Max: 0.143026 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000478 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005185 | Grad Max: 0.012410 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001251 | Grad Max: 0.003783 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000729 | Grad Max: 0.002165 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019502 | Grad Max: 0.019502 [GRADIENT NORM TOTAL] 10.4690 [EPOCH SUMMARY] Train Loss: 0.8335 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8202 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 151/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.916 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012677 0.4987323] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.081 [MASKS] A(Pass/Fail): 688/1360 | B: 656/1392 | C: 562/1486 [LOSS Ex1] A: 0.63703 | B: 0.61637 | C: 0.61047 [LOGITS Ex2 A] Mean Abs: 2.213 | Max: 5.732 [LOSS Ex2] A: 0.10685 | B: 0.33384 | C: 0.21725 ** [JOINT LOSS] ** : 0.840601 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004339 | Grad Max: 0.201389 -> Layer: shared_layers.0.bias | Grad Mean: 0.531898 | Grad Max: 2.716517 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005494 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003998 | Grad Max: 0.003998 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003505 | Grad Max: 0.673421 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065270 | Grad Max: 3.781298 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000372 | Grad Max: 0.013752 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032657 | Grad Max: 0.177564 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000487 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006344 | Grad Max: 0.013438 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000326 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001516 | Grad Max: 0.004482 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000851 | Grad Max: 0.002172 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023054 | Grad Max: 0.023054 [GRADIENT NORM TOTAL] 12.3560 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.822 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5435288 0.4564712] | Indices: 
[0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 600/1448 [LOSS Ex1] A: 0.63751 | B: 0.61687 | C: 0.60988 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 5.923 [LOSS Ex2] A: 0.11357 | B: 0.31748 | C: 0.22220 ** [JOINT LOSS] ** : 0.839170 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.052870 -> Layer: shared_layers.0.bias | Grad Mean: 0.154794 | Grad Max: 1.036783 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005690 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006130 | Grad Max: 0.006130 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000994 | Grad Max: 0.492011 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017731 | Grad Max: 2.736600 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003463 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006143 | Grad Max: 0.036642 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000189 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001110 | Grad Max: 0.004088 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000242 | Grad Max: 0.001096 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000338 | Grad Max: 0.001319 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004298 | Grad Max: 0.004298 [GRADIENT NORM TOTAL] 4.6773 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.028 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7982587 0.2017413] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.082 [MASKS] A(Pass/Fail): 748/1300 | B: 646/1402 | C: 600/1448 [LOSS Ex1] A: 0.63108 | B: 0.61241 | C: 0.60652 [LOGITS Ex2 A] Mean Abs: 2.310 | Max: 7.499 [LOSS Ex2] A: 0.10190 | B: 0.31319 | C: 0.21767 ** [JOINT LOSS] ** : 0.827590 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009020 | Grad Max: 0.292535 
-> Layer: shared_layers.0.bias | Grad Mean: 0.790358 | Grad Max: 3.739343 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002304 | Grad Max: 0.005757 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004533 | Grad Max: 0.004533 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005048 | Grad Max: 0.837127 -> Layer: exit2_layers.0.bias | Grad Mean: 0.093799 | Grad Max: 4.649765 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000570 | Grad Max: 0.019663 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049573 | Grad Max: 0.254771 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000720 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009735 | Grad Max: 0.020284 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000431 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002364 | Grad Max: 0.006615 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001290 | Grad Max: 0.002965 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035539 | Grad Max: 0.035539 [GRADIENT NORM TOTAL] 17.5784 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 1.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000022 0.49999776] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.081 [MASKS] A(Pass/Fail): 723/1325 | B: 608/1248 | C: 579/1469 [LOSS Ex1] A: 0.63816 | B: 0.61660 | C: 0.61075 [LOGITS Ex2 A] Mean Abs: 2.341 | Max: 6.679 [LOSS Ex2] A: 0.09228 | B: 0.31556 | C: 0.22008 ** [JOINT LOSS] ** : 0.831138 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005704 | Grad Max: 0.255763 -> Layer: shared_layers.0.bias | Grad Mean: 0.687486 | Grad Max: 3.456053 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005613 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002492 | Grad Max: 0.002492 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004593 | Grad Max: 0.797272 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086076 | Grad Max: 4.467910 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000506 | Grad Max: 0.017860 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044409 | Grad Max: 0.257190 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000647 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008490 | Grad Max: 0.016977 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000373 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002051 | Grad Max: 0.005779 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001128 | Grad Max: 0.002810 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031276 | Grad Max: 0.031276 [GRADIENT NORM TOTAL] 16.1804 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.805 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7296981 0.27030194] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 714/1334 | B: 656/1392 | C: 577/1471 [LOSS Ex1] A: 0.63346 | B: 0.61623 | C: 0.61105 [LOGITS Ex2 A] Mean Abs: 2.286 | Max: 6.397 [LOSS Ex2] A: 0.12938 | B: 0.31952 | C: 0.22544 ** [JOINT LOSS] ** : 0.845023 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003538 | Grad Max: 0.129949 -> Layer: shared_layers.0.bias | Grad Mean: 0.249851 | Grad Max: 1.255929 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.006008 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001259 | Grad Max: 0.001259 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001779 | Grad Max: 0.507691 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031229 | Grad Max: 2.803877 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.006619 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014324 | Grad Max: 0.093441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000285 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002477 | Grad Max: 0.006708 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000150 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000571 | Grad Max: 0.001860 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000296 | Grad Max: 0.001011 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008399 | Grad Max: 0.008399 [GRADIENT NORM TOTAL] 6.5383 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.931 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6362463 0.36375368] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 590/1026 | B: 644/1404 | C: 567/1481 [LOSS Ex1] A: 0.63162 | B: 0.61673 | C: 0.60957 [LOGITS Ex2 A] Mean Abs: 2.271 | Max: 10.422 [LOSS Ex2] A: 0.10432 | B: 0.33973 | C: 0.22280 ** [JOINT LOSS] ** : 0.841587 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010165 | Grad Max: 0.282641 -> Layer: shared_layers.0.bias | Grad Mean: 0.786371 | Grad Max: 3.568820 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005939 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008858 | Grad Max: 0.008858 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004982 | Grad Max: 0.997391 -> Layer: exit2_layers.0.bias | Grad Mean: 0.092267 | Grad Max: 5.525179 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.020396 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049734 | Grad Max: 0.276255 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000712 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009836 | Grad Max: 0.019348 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000479 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002390 | Grad Max: 0.007006 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001425 | Grad Max: 0.003299 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037422 | Grad Max: 0.037422 [GRADIENT NORM TOTAL] 17.1091 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.143 | Max: 1.143 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508025 0.49197498] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 646/1402 | C: 589/1459 [LOSS Ex1] A: 0.63234 | B: 0.61228 | C: 0.60798 [LOGITS Ex2 A] Mean Abs: 2.237 | Max: 8.135 [LOSS Ex2] A: 0.10480 | B: 0.32722 | C: 0.22659 ** [JOINT LOSS] ** : 0.837069 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010824 | Grad Max: 0.303588 -> Layer: shared_layers.0.bias | Grad Mean: 0.888505 | Grad Max: 3.964028 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002204 | Grad Max: 0.005753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000069 | Grad Max: 0.000069 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005707 | Grad Max: 0.737754 -> Layer: exit2_layers.0.bias | Grad Mean: 0.105968 | Grad Max: 4.087673 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000678 | Grad Max: 0.022336 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059058 | Grad Max: 0.299503 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000833 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011667 | Grad Max: 0.023381 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000553 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002824 | Grad Max: 0.008246 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001632 | Grad Max: 0.003647 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043353 | Grad Max: 0.043353 [GRADIENT NORM TOTAL] 18.7425 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.078 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50751644 0.49248362] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 717/1331 | B: 608/1248 | C: 601/1447 [LOSS Ex1] A: 0.62904 | B: 0.61647 | C: 0.61088 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.683 [LOSS Ex2] A: 0.10786 | B: 0.31246 | C: 0.22586 ** [JOINT LOSS] ** : 0.834191 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003976 | Grad Max: 0.157316 -> Layer: shared_layers.0.bias | Grad Mean: 0.458804 | Grad Max: 2.073266 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.006711 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000579 | Grad Max: 0.000579 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002993 | Grad Max: 0.444279 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055942 | Grad Max: 2.479338 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000353 | Grad Max: 0.011360 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030924 | Grad Max: 0.162107 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000461 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005933 | Grad Max: 0.012217 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001442 | Grad Max: 0.004022 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000833 | Grad Max: 0.002175 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022508 | Grad Max: 0.022508 [GRADIENT NORM TOTAL] 10.1809 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.111 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50833374 0.4916663 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 717/1331 | B: 656/1392 | C: 580/1468 [LOSS Ex1] A: 0.62801 | B: 0.61611 | C: 0.61178 [LOGITS Ex2 A] Mean Abs: 2.281 | Max: 6.703 [LOSS Ex2] A: 0.12099 | B: 0.31973 | C: 0.21952 ** [JOINT LOSS] ** : 0.838712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008554 | Grad Max: 0.269242 -> Layer: shared_layers.0.bias | Grad Mean: 0.671972 | Grad Max: 3.141446 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002225 | Grad Max: 0.006321 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002148 | Grad Max: 0.002148 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004412 | Grad Max: 
0.819476 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080487 | Grad Max: 4.584902 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000481 | Grad Max: 0.017567 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041474 | Grad Max: 0.226098 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000652 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008325 | Grad Max: 0.016591 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000354 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002006 | Grad Max: 0.005410 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001100 | Grad Max: 0.002850 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030023 | Grad Max: 0.030023 [GRADIENT NORM TOTAL] 15.1571 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.919 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501258 0.49874195] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.560 | Std: 0.081 [MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 561/1487 [LOSS Ex1] A: 0.63687 | B: 0.61661 | C: 0.61584 [LOGITS Ex2 A] Mean Abs: 2.277 | Max: 6.040 [LOSS Ex2] A: 0.10405 | B: 0.34283 | C: 0.25895 ** [JOINT LOSS] ** : 0.858384 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011898 | Grad Max: 0.431039 -> Layer: shared_layers.0.bias | Grad Mean: 1.109747 | Grad Max: 5.819005 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.005277 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006799 | Grad Max: 0.006799 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007114 | Grad Max: 1.320729 -> Layer: exit2_layers.0.bias | Grad Mean: 0.131764 | Grad Max: 7.347874 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000758 | Grad Max: 0.025064 -> Layer: exit2_layers.3.bias | Grad Mean: 0.066136 | Grad Max: 0.343326 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.001027 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013011 | Grad Max: 0.027267 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000577 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003096 | Grad Max: 0.009182 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001652 | Grad Max: 0.003400 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044887 | Grad Max: 0.044887 [GRADIENT NORM TOTAL] 25.0426 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.132 | Max: 0.825 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54357105 0.45642892] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 688/1360 | B: 647/1401 | C: 574/1474 [LOSS Ex1] A: 0.63736 | B: 0.61217 | C: 0.61002 [LOGITS Ex2 A] Mean Abs: 2.250 | Max: 6.199 [LOSS Ex2] A: 0.11908 | B: 0.30508 | C: 0.22787 ** [JOINT LOSS] ** : 0.837191 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009388 | Grad Max: 0.339843 -> Layer: shared_layers.0.bias | Grad Mean: 0.805733 | Grad Max: 4.273692 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005494 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005268 | Grad Max: 0.005268 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005150 | Grad Max: 1.011205 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094589 | Grad Max: 5.631574 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000560 | Grad Max: 0.018300 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048752 | Grad Max: 0.247341 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000704 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009694 | Grad Max: 0.018758 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000458 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002349 | Grad Max: 0.006670 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001251 | Grad Max: 0.003118 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034501 | Grad Max: 0.034501 [GRADIENT NORM TOTAL] 18.0836 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.031 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.79907197 0.200928 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.082 [MASKS] A(Pass/Fail): 750/1298 | B: 608/1248 | C: 583/1465 [LOSS Ex1] A: 0.63093 | B: 0.61636 | C: 0.60907 [LOGITS Ex2 A] Mean Abs: 2.221 | Max: 7.841 [LOSS Ex2] A: 0.11288 | B: 0.30435 | C: 0.21591 ** [JOINT LOSS] ** : 0.829831 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004184 | Grad Max: 0.151842 -> Layer: shared_layers.0.bias | Grad Mean: 0.136723 | Grad Max: 0.529207 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002192 | Grad Max: 0.005647 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000640 | Grad Max: 0.000640 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001030 | Grad Max: 0.215933 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017103 | Grad Max: 1.198506 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000081 | Grad Max: 0.005083 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005272 | Grad Max: 0.047567 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000157 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000743 | Grad Max: 0.003360 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000144 | Grad Max: 0.000852 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.000983 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001449 | Grad Max: 0.001449 [GRADIENT NORM TOTAL] 3.1318 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.146 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000136 0.49998638] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.081 [MASKS] A(Pass/Fail): 723/1325 | B: 656/1392 | C: 570/1478 [LOSS Ex1] A: 0.63802 | B: 0.61600 | C: 0.61062 [LOGITS Ex2 A] Mean Abs: 
2.225 | Max: 6.652 [LOSS Ex2] A: 0.09023 | B: 0.33401 | C: 0.20457 ** [JOINT LOSS] ** : 0.831149 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005791 | Grad Max: 0.200564 -> Layer: shared_layers.0.bias | Grad Mean: 0.517420 | Grad Max: 2.775786 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005836 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001847 | Grad Max: 0.001847 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003287 | Grad Max: 0.688409 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059880 | Grad Max: 3.866540 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.012739 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031187 | Grad Max: 0.164926 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000461 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006295 | Grad Max: 0.012501 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000305 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001521 | Grad Max: 0.004541 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000872 | Grad Max: 0.002626 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023129 | Grad Max: 0.023129 [GRADIENT NORM TOTAL] 11.6360 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.808 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7303075 0.2696925] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 714/1334 | B: 645/1403 | C: 362/1014 [LOSS Ex1] A: 0.63331 | B: 0.61651 | C: 0.62121 [LOGITS Ex2 A] Mean Abs: 2.228 | Max: 8.244 [LOSS Ex2] A: 0.11683 | B: 0.31819 | C: 0.25745 ** [JOINT LOSS] ** : 0.854500 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005799 | Grad Max: 0.175994 -> Layer: shared_layers.0.bias | Grad Mean: 0.348157 | Grad Max: 1.968011 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.005664 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.010299 | Grad Max: 0.010299 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.605284 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038378 | Grad Max: 3.379683 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000224 | Grad Max: 0.007793 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019051 | Grad Max: 0.092470 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000372 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003849 | Grad Max: 0.008297 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000209 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000974 | Grad Max: 0.002800 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000679 | Grad Max: 0.001717 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017521 | Grad Max: 0.017521 [GRADIENT NORM TOTAL] 7.7720 [EPOCH SUMMARY] Train Loss: 0.8390 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8193 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 152/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.934 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63655245 0.3634475 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.082 [MASKS] A(Pass/Fail): 590/1026 | B: 647/1401 | C: 561/1487 [LOSS Ex1] A: 0.63147 | B: 0.61205 | C: 0.61228 [LOGITS Ex2 A] Mean Abs: 2.298 | Max: 13.373 [LOSS Ex2] A: 0.10686 | B: 0.30889 | C: 0.21518 ** [JOINT LOSS] ** : 0.828916 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005457 | Grad Max: 0.173669 -> Layer: shared_layers.0.bias | Grad Mean: 0.495622 | Grad Max: 2.131582 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002175 | Grad Max: 0.005513 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004311 | Grad Max: 0.004311 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003357 | Grad Max: 0.406862 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.060532 | Grad Max: 2.254767 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000343 | Grad Max: 0.014922 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030052 | Grad Max: 0.197044 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000462 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005504 | Grad Max: 0.011462 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001321 | Grad Max: 0.003846 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000632 | Grad Max: 0.001957 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019066 | Grad Max: 0.019066 [GRADIENT NORM TOTAL] 10.8484 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.147 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50803024 0.49196982] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 608/1248 | C: 562/1486 [LOSS Ex1] A: 0.63220 | B: 0.61624 | C: 0.61288 [LOGITS Ex2 A] Mean Abs: 2.297 | Max: 9.391 [LOSS Ex2] A: 0.10037 | B: 0.30304 | C: 0.23481 ** [JOINT LOSS] ** : 0.833180 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005347 | Grad Max: 0.217880 -> Layer: shared_layers.0.bias | Grad Mean: 0.441438 | Grad Max: 1.986543 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005741 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001087 | Grad Max: 0.001087 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.382197 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056290 | Grad Max: 2.031134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.011591 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028273 | Grad Max: 0.170345 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000445 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005251 | Grad Max: 0.011290 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000236 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001256 | Grad Max: 0.003617 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000621 | Grad Max: 0.001842 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018391 | Grad Max: 0.018391 [GRADIENT NORM TOTAL] 10.0786 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.082 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50752765 0.49247238] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 717/1331 | B: 656/1392 | C: 568/1480 [LOSS Ex1] A: 0.62889 | B: 0.61589 | C: 0.60978 [LOGITS Ex2 A] Mean Abs: 2.278 | Max: 6.451 [LOSS Ex2] A: 0.10789 | B: 0.32146 | C: 0.20706 ** [JOINT LOSS] ** : 0.830326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003653 | Grad Max: 0.143095 -> Layer: shared_layers.0.bias | Grad Mean: 0.146542 | Grad Max: 0.830313 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.006506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006131 | Grad Max: 0.006131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000851 | Grad Max: 0.583100 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014159 | Grad Max: 3.224070 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002082 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002550 | Grad Max: 0.024211 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000196 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000578 | Grad Max: 0.002755 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000155 | Grad Max: 0.000922 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000307 | Grad Max: 0.001198 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002844 | Grad Max: 0.002844 [GRADIENT NORM TOTAL] 4.6242 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.115 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50838417 0.4916158 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 717/1331 | B: 645/1403 | C: 588/1460 [LOSS Ex1] A: 0.62786 | B: 0.61639 | C: 0.60812 [LOGITS Ex2 A] Mean Abs: 2.221 | Max: 8.410 [LOSS Ex2] A: 0.11564 | B: 0.32180 | C: 0.21617 ** [JOINT LOSS] ** : 0.835326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003996 | Grad Max: 0.124445 -> Layer: shared_layers.0.bias | Grad Mean: 0.316463 | Grad Max: 1.477840 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.006334 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001828 | Grad Max: 0.001828 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001997 | Grad Max: 0.242869 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036701 | Grad Max: 1.353868 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.007237 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017498 | Grad Max: 0.091919 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003462 | Grad Max: 0.007748 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000233 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000876 | Grad Max: 0.003306 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000539 | Grad Max: 0.002171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014585 | Grad Max: 0.014585 [GRADIENT NORM TOTAL] 6.7530 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.923 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5012295 0.49877048] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 688/1360 | B: 647/1401 | C: 589/1459 [LOSS Ex1] A: 0.63673 | B: 0.61194 | C: 0.60922 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 5.920 
[LOSS Ex2] A: 0.10234 | B: 0.29548 | C: 0.22460 ** [JOINT LOSS] ** : 0.826765 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.064343 -> Layer: shared_layers.0.bias | Grad Mean: 0.145974 | Grad Max: 0.706828 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.006234 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009743 | Grad Max: 0.009743 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001080 | Grad Max: 0.183179 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019391 | Grad Max: 1.017213 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000112 | Grad Max: 0.004529 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009528 | Grad Max: 0.053411 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000192 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001776 | Grad Max: 0.004876 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000432 | Grad Max: 0.001745 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000421 | Grad Max: 0.001797 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007732 | Grad Max: 0.007732 [GRADIENT NORM TOTAL] 3.6728 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.828 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5435763 0.4564237] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 688/1360 | B: 608/1248 | C: 593/1455 [LOSS Ex1] A: 0.63722 | B: 0.61612 | C: 0.60387 [LOGITS Ex2 A] Mean Abs: 2.234 | Max: 5.961 [LOSS Ex2] A: 0.11484 | B: 0.30163 | C: 0.21732 ** [JOINT LOSS] ** : 0.830330 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007294 | Grad Max: 0.247386 -> Layer: shared_layers.0.bias | Grad Mean: 0.579786 | Grad Max: 3.179410 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002137 | Grad Max: 0.005739 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007377 | Grad Max: 
0.007377 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003657 | Grad Max: 0.723002 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067811 | Grad Max: 4.029886 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000397 | Grad Max: 0.012938 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034291 | Grad Max: 0.181128 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000511 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006822 | Grad Max: 0.013885 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000335 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001685 | Grad Max: 0.004445 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000929 | Grad Max: 0.002513 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025458 | Grad Max: 0.025458 [GRADIENT NORM TOTAL] 13.0304 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.035 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7998073 0.20019267] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.082 [MASKS] A(Pass/Fail): 750/1298 | B: 656/1392 | C: 584/1464 [LOSS Ex1] A: 0.63077 | B: 0.61577 | C: 0.61039 [LOGITS Ex2 A] Mean Abs: 2.259 | Max: 7.347 [LOSS Ex2] A: 0.10885 | B: 0.32002 | C: 0.23630 ** [JOINT LOSS] ** : 0.840702 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010097 | Grad Max: 0.257659 -> Layer: shared_layers.0.bias | Grad Mean: 0.666161 | Grad Max: 3.093738 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005990 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000597 | Grad Max: 0.000597 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004334 | Grad Max: 0.822711 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080152 | Grad Max: 4.596558 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000505 | Grad Max: 0.015973 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043609 | Grad Max: 0.215492 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000674 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.008688 | Grad Max: 0.017676 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000415 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002116 | Grad Max: 0.006195 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001151 | Grad Max: 0.002819 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031240 | Grad Max: 0.031240 [GRADIENT NORM TOTAL] 14.7961 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.150 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50003743 0.49996254] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 723/1325 | B: 645/1403 | C: 596/1452 [LOSS Ex1] A: 0.63786 | B: 0.61627 | C: 0.60775 [LOGITS Ex2 A] Mean Abs: 2.271 | Max: 6.306 [LOSS Ex2] A: 0.09850 | B: 0.30807 | C: 0.21832 ** [JOINT LOSS] ** : 0.828924 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004477 | Grad Max: 0.107407 -> Layer: shared_layers.0.bias | Grad Mean: 0.195126 | Grad Max: 0.891397 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005417 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002383 | Grad Max: 0.002383 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001441 | Grad Max: 0.405407 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026162 | Grad Max: 2.251768 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000142 | Grad Max: 0.004571 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011842 | Grad Max: 0.060067 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000233 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002484 | Grad Max: 0.006139 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000155 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000646 | Grad Max: 0.002170 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000417 | Grad Max: 0.001322 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009600 | Grad Max: 0.009600 [GRADIENT NORM 
TOTAL] 5.1166 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.811 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73086554 0.26913443] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 714/1334 | B: 648/1400 | C: 570/1478 [LOSS Ex1] A: 0.63314 | B: 0.61181 | C: 0.61333 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 6.727 [LOSS Ex2] A: 0.11901 | B: 0.31572 | C: 0.21062 ** [JOINT LOSS] ** : 0.834546 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007148 | Grad Max: 0.256460 -> Layer: shared_layers.0.bias | Grad Mean: 0.733536 | Grad Max: 3.509427 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006053 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009260 | Grad Max: 0.009260 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004730 | Grad Max: 0.781187 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088095 | Grad Max: 4.386971 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000546 | Grad Max: 0.018476 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048221 | Grad Max: 0.262483 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000751 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009336 | Grad Max: 0.020172 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000452 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002251 | Grad Max: 0.007085 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001209 | Grad Max: 0.003141 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033623 | Grad Max: 0.033623 [GRADIENT NORM TOTAL] 16.1767 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.938 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6367706 0.36322936] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 590/1026 | B: 608/1248 | C: 574/1474 [LOSS Ex1] 
A: 0.63129 | B: 0.61600 | C: 0.61383 [LOGITS Ex2 A] Mean Abs: 2.209 | Max: 9.394 [LOSS Ex2] A: 0.11230 | B: 0.34648 | C: 0.24307 ** [JOINT LOSS] ** : 0.854322 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008503 | Grad Max: 0.340423 -> Layer: shared_layers.0.bias | Grad Mean: 0.978707 | Grad Max: 4.525155 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.006051 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003400 | Grad Max: 0.003400 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006172 | Grad Max: 0.845343 -> Layer: exit2_layers.0.bias | Grad Mean: 0.115138 | Grad Max: 4.747772 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000735 | Grad Max: 0.024511 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065453 | Grad Max: 0.331815 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000908 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012840 | Grad Max: 0.025138 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000584 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003145 | Grad Max: 0.008764 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001673 | Grad Max: 0.003620 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047505 | Grad Max: 0.047505 [GRADIENT NORM TOTAL] 21.0842 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.151 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50807613 0.49192384] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 656/1392 | C: 567/1481 [LOSS Ex1] A: 0.63202 | B: 0.61566 | C: 0.61403 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 8.092 [LOSS Ex2] A: 0.10005 | B: 0.34093 | C: 0.23409 ** [JOINT LOSS] ** : 0.845595 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004763 | Grad Max: 0.235202 -> Layer: shared_layers.0.bias | Grad Mean: 0.552336 | Grad Max: 3.177644 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | 
Grad Max: 0.005950 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001820 | Grad Max: 0.001820 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003490 | Grad Max: 0.659501 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064662 | Grad Max: 3.711680 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000385 | Grad Max: 0.013048 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034443 | Grad Max: 0.176715 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000541 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006703 | Grad Max: 0.014614 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000312 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001647 | Grad Max: 0.005036 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000894 | Grad Max: 0.002505 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025207 | Grad Max: 0.025207 [GRADIENT NORM TOTAL] 13.0881 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50740045 0.49259952] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 645/1403 | C: 569/1479 [LOSS Ex1] A: 0.62871 | B: 0.61616 | C: 0.61103 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 6.750 [LOSS Ex2] A: 0.11140 | B: 0.31472 | C: 0.21630 ** [JOINT LOSS] ** : 0.832776 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004154 | Grad Max: 0.146170 -> Layer: shared_layers.0.bias | Grad Mean: 0.352488 | Grad Max: 1.840593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006062 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000933 | Grad Max: 0.000933 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002451 | Grad Max: 0.508351 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045527 | Grad Max: 2.846020 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007116 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021430 | Grad Max: 
0.101610 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000368 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004163 | Grad Max: 0.009193 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001008 | Grad Max: 0.003011 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000524 | Grad Max: 0.001864 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014692 | Grad Max: 0.014692 [GRADIENT NORM TOTAL] 8.4936 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.119 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50849265 0.49150735] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 717/1331 | B: 648/1400 | C: 570/1478 [LOSS Ex1] A: 0.62768 | B: 0.61170 | C: 0.61551 [LOGITS Ex2 A] Mean Abs: 2.255 | Max: 7.774 [LOSS Ex2] A: 0.11817 | B: 0.31445 | C: 0.23903 ** [JOINT LOSS] ** : 0.842181 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007535 | Grad Max: 0.259646 -> Layer: shared_layers.0.bias | Grad Mean: 0.726010 | Grad Max: 3.326264 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002255 | Grad Max: 0.006212 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004580 | Grad Max: 0.004580 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004844 | Grad Max: 0.964914 -> Layer: exit2_layers.0.bias | Grad Mean: 0.089828 | Grad Max: 5.396534 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000553 | Grad Max: 0.015983 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048983 | Grad Max: 0.236670 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000708 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009651 | Grad Max: 0.019401 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000408 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002325 | Grad Max: 0.006333 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001195 | Grad Max: 0.002776 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.033823 | Grad Max: 0.033823 [GRADIENT NORM TOTAL] 16.3822 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.926 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50124663 0.4987534 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 688/1360 | B: 608/1248 | C: 430/946 [LOSS Ex1] A: 0.63655 | B: 0.61590 | C: 0.60226 [LOGITS Ex2 A] Mean Abs: 2.215 | Max: 6.590 [LOSS Ex2] A: 0.10543 | B: 0.30119 | C: 0.20923 ** [JOINT LOSS] ** : 0.823522 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002768 | Grad Max: 0.111836 -> Layer: shared_layers.0.bias | Grad Mean: 0.246830 | Grad Max: 1.527001 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.005638 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004570 | Grad Max: 0.004570 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001650 | Grad Max: 0.264622 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029936 | Grad Max: 1.469090 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.007582 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012700 | Grad Max: 0.084712 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000233 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002396 | Grad Max: 0.005929 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000604 | Grad Max: 0.001914 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000403 | Grad Max: 0.001398 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008744 | Grad Max: 0.008744 [GRADIENT NORM TOTAL] 5.8505 [EPOCH SUMMARY] Train Loss: 0.8348 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8191 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 153/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.831 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54353017 0.45646977] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.559 | Std: 0.078 [MASKS] A(Pass/Fail): 688/1360 | B: 656/1392 | C: 599/1449 [LOSS Ex1] A: 0.63706 | B: 0.61556 | C: 0.60683 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.852 [LOSS Ex2] A: 0.10949 | B: 0.32987 | C: 0.20835 ** [JOINT LOSS] ** : 0.835717 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006632 | Grad Max: 0.171242 -> Layer: shared_layers.0.bias | Grad Mean: 0.511304 | Grad Max: 2.288469 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005759 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008658 | Grad Max: 0.008658 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003512 | Grad Max: 0.868056 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064455 | Grad Max: 4.803056 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000376 | Grad Max: 0.012559 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032782 | Grad Max: 0.163594 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000517 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006561 | Grad Max: 0.014179 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000361 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001585 | Grad Max: 0.005367 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000889 | Grad Max: 0.002670 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024111 | Grad Max: 0.024111 [GRADIENT NORM TOTAL] 12.1187 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.039 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8005527 0.1994473] | 
Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 750/1298 | B: 645/1403 | C: 569/1479 [LOSS Ex1] A: 0.63061 | B: 0.61605 | C: 0.61226 [LOGITS Ex2 A] Mean Abs: 2.187 | Max: 6.090 [LOSS Ex2] A: 0.10188 | B: 0.32635 | C: 0.22050 ** [JOINT LOSS] ** : 0.835888 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004257 | Grad Max: 0.222570 -> Layer: shared_layers.0.bias | Grad Mean: 0.575908 | Grad Max: 2.820241 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.006238 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000955 | Grad Max: 0.000955 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003712 | Grad Max: 0.847083 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069397 | Grad Max: 4.689932 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.013599 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037634 | Grad Max: 0.189873 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000595 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007313 | Grad Max: 0.015528 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000386 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001774 | Grad Max: 0.005724 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000958 | Grad Max: 0.002754 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026793 | Grad Max: 0.026793 [GRADIENT NORM TOTAL] 13.2141 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.154 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5000642 0.49993578] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.081 [MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 598/1450 [LOSS Ex1] A: 0.63772 | B: 0.61160 | C: 0.61127 [LOGITS Ex2 A] Mean Abs: 2.211 | Max: 6.384 [LOSS Ex2] A: 0.09957 | B: 0.29309 | C: 0.22728 ** [JOINT LOSS] ** : 0.826838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002842 | Grad Max: 
0.080170 -> Layer: shared_layers.0.bias | Grad Mean: 0.141178 | Grad Max: 0.585776 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005148 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001837 | Grad Max: 0.001837 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001054 | Grad Max: 0.615192 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018772 | Grad Max: 3.414342 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.003104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004728 | Grad Max: 0.028755 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000184 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000954 | Grad Max: 0.003609 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.001181 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000406 | Grad Max: 0.001592 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004183 | Grad Max: 0.004183 [GRADIENT NORM TOTAL] 5.2314 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.813 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7313856 0.26861444] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 713/1335 | B: 608/1248 | C: 579/1469 [LOSS Ex1] A: 0.63299 | B: 0.61579 | C: 0.61195 [LOGITS Ex2 A] Mean Abs: 2.212 | Max: 6.461 [LOSS Ex2] A: 0.11579 | B: 0.30684 | C: 0.23140 ** [JOINT LOSS] ** : 0.838258 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004304 | Grad Max: 0.189417 -> Layer: shared_layers.0.bias | Grad Mean: 0.456047 | Grad Max: 2.481765 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005495 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004815 | Grad Max: 0.004815 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003028 | Grad Max: 0.581367 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055752 | Grad Max: 3.257293 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000315 | Grad Max: 0.012178 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027978 | Grad Max: 0.145476 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000393 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005502 | Grad Max: 0.011682 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000307 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001343 | Grad Max: 0.004056 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000713 | Grad Max: 0.002214 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020502 | Grad Max: 0.020502 [GRADIENT NORM TOTAL] 10.6662 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.940 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63697577 0.36302426] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 590/1026 | B: 656/1392 | C: 580/1468 [LOSS Ex1] A: 0.63114 | B: 0.61546 | C: 0.61003 [LOGITS Ex2 A] Mean Abs: 2.276 | Max: 9.944 [LOSS Ex2] A: 0.11038 | B: 0.31609 | C: 0.22633 ** [JOINT LOSS] ** : 0.836478 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004582 | Grad Max: 0.179477 -> Layer: shared_layers.0.bias | Grad Mean: 0.481807 | Grad Max: 2.393337 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006155 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004277 | Grad Max: 0.004277 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003022 | Grad Max: 0.595395 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056510 | Grad Max: 3.322370 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000337 | Grad Max: 0.011269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029815 | Grad Max: 0.142092 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000461 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005741 | Grad Max: 0.012139 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000283 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001374 | Grad Max: 0.004168 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000714 | Grad Max: 0.002043 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020125 | Grad Max: 0.020125 [GRADIENT NORM TOTAL] 10.7058 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.155 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5080868 0.49191314] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 644/1404 | C: 592/1456 [LOSS Ex1] A: 0.63187 | B: 0.61595 | C: 0.60509 [LOGITS Ex2 A] Mean Abs: 2.209 | Max: 8.015 [LOSS Ex2] A: 0.10016 | B: 0.31746 | C: 0.20784 ** [JOINT LOSS] ** : 0.826127 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002870 | Grad Max: 0.078955 -> Layer: shared_layers.0.bias | Grad Mean: 0.098221 | Grad Max: 0.683431 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005708 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000468 | Grad Max: 0.000468 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000864 | Grad Max: 0.171044 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015247 | Grad Max: 0.926127 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000075 | Grad Max: 0.005291 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005803 | Grad Max: 0.079131 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000163 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001028 | Grad Max: 0.003771 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000247 | Grad Max: 0.001018 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000448 | Grad Max: 0.001455 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004226 | Grad Max: 0.004226 [GRADIENT NORM TOTAL] 2.8365 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.143 | Max: 1.090 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5073296 0.49267048] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 648/1400 | C: 568/1480 [LOSS Ex1] A: 0.62856 | B: 0.61149 | C: 0.61208 [LOGITS Ex2 A] Mean Abs: 2.220 | Max: 7.130 [LOSS Ex2] A: 0.10179 | B: 0.30152 | C: 0.20733 ** [JOINT LOSS] ** : 0.820923 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002528 | Grad Max: 0.061896 -> Layer: shared_layers.0.bias | Grad Mean: 0.191037 | Grad Max: 0.890903 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002266 | Grad Max: 0.005742 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000374 | Grad Max: 0.000374 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001346 | Grad Max: 0.374611 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024010 | Grad Max: 2.090009 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.005000 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009744 | Grad Max: 0.063004 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000216 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001862 | Grad Max: 0.005601 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000119 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000444 | Grad Max: 0.001684 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001440 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006716 | Grad Max: 0.006716 [GRADIENT NORM TOTAL] 4.9425 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.123 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085858 0.4914142] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.082 [MASKS] A(Pass/Fail): 718/1330 | B: 609/1247 | C: 580/1468 [LOSS Ex1] A: 0.62752 | B: 0.61568 | C: 0.61075 [LOGITS Ex2 A] Mean Abs: 2.209 | Max: 8.798 [LOSS Ex2] A: 0.11303 | B: 0.29763 | C: 0.22910 ** [JOINT LOSS] ** : 0.831240 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003589 | Grad Max: 0.131314 -> Layer: shared_layers.0.bias | Grad Mean: 0.142613 | Grad Max: 0.521141 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006236 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005399 | Grad Max: 0.005399 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001327 | Grad Max: 0.317898 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023490 | Grad Max: 1.775632 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000129 | Grad Max: 0.004779 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010693 | Grad Max: 0.062502 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002230 | Grad Max: 0.005516 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000537 | Grad Max: 0.001974 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000365 | Grad Max: 0.001342 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007173 | Grad Max: 0.007173 [GRADIENT NORM TOTAL] 4.1686 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.930 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50124466 0.4987553 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 688/1360 | B: 658/1390 | C: 564/1484 [LOSS Ex1] A: 0.63639 | B: 0.61534 | C: 0.61437 [LOGITS Ex2 A] Mean Abs: 2.179 | Max: 5.795 [LOSS Ex2] A: 0.10330 | B: 0.32384 | C: 0.22196 ** [JOINT LOSS] ** : 0.838400 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003217 | Grad Max: 0.105462 -> Layer: shared_layers.0.bias | Grad Mean: 0.190566 | Grad Max: 1.194301 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002045 | Grad Max: 0.005273 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004954 | Grad Max: 0.004954 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001314 | Grad Max: 
0.544803 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023628 | Grad Max: 3.063382 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.005527 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008775 | Grad Max: 0.063388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000281 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001866 | Grad Max: 0.006342 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000496 | Grad Max: 0.001550 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001523 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008565 | Grad Max: 0.008565 [GRADIENT NORM TOTAL] 5.7033 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.834 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5434653 0.4565347] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.078 [MASKS] A(Pass/Fail): 688/1360 | B: 644/1404 | C: 531/1517 [LOSS Ex1] A: 0.63690 | B: 0.61582 | C: 0.61645 [LOGITS Ex2 A] Mean Abs: 2.206 | Max: 6.629 [LOSS Ex2] A: 0.10932 | B: 0.31452 | C: 0.22106 ** [JOINT LOSS] ** : 0.838024 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004125 | Grad Max: 0.153458 -> Layer: shared_layers.0.bias | Grad Mean: 0.138999 | Grad Max: 0.768375 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002010 | Grad Max: 0.005536 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007907 | Grad Max: 0.007907 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001015 | Grad Max: 0.586894 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016323 | Grad Max: 3.263661 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000054 | Grad Max: 0.002760 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002397 | Grad Max: 0.018473 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000155 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000380 | Grad Max: 0.002194 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000095 | Grad Max: 0.000781 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000374 | Grad Max: 0.001188 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001684 | Grad Max: 0.001684 [GRADIENT NORM TOTAL] 4.9136 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.043 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.80136406 0.19863589] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 750/1298 | B: 649/1399 | C: 575/1473 [LOSS Ex1] A: 0.63044 | B: 0.61135 | C: 0.60607 [LOGITS Ex2 A] Mean Abs: 2.226 | Max: 7.261 [LOSS Ex2] A: 0.10038 | B: 0.29640 | C: 0.19476 ** [JOINT LOSS] ** : 0.813134 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003389 | Grad Max: 0.119697 -> Layer: shared_layers.0.bias | Grad Mean: 0.243384 | Grad Max: 1.336020 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.005903 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001879 | Grad Max: 0.001879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001498 | Grad Max: 0.316151 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027422 | Grad Max: 1.755834 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.006507 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010783 | Grad Max: 0.058504 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000222 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002071 | Grad Max: 0.005862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000135 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000541 | Grad Max: 0.001846 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000558 | Grad Max: 0.001689 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008467 | Grad Max: 0.008467 [GRADIENT NORM TOTAL] 5.6746 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 1.158 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50010705 0.49989295] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 723/1325 | B: 609/1247 | C: 612/1436 [LOSS Ex1] A: 0.63754 | B: 0.61554 | C: 0.61087 [LOGITS Ex2 A] Mean Abs: 2.235 | Max: 5.301 [LOSS Ex2] A: 0.09857 | B: 0.30594 | C: 0.21067 ** [JOINT LOSS] ** : 0.826378 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003211 | Grad Max: 0.141695 -> Layer: shared_layers.0.bias | Grad Mean: 0.158788 | Grad Max: 0.656063 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005467 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002540 | Grad Max: 0.002540 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001200 | Grad Max: 0.223541 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020388 | Grad Max: 1.211584 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004502 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010153 | Grad Max: 0.056926 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000225 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002215 | Grad Max: 0.005648 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000544 | Grad Max: 0.001737 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001582 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008367 | Grad Max: 0.008367 [GRADIENT NORM TOTAL] 3.9360 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.817 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7320677 0.2679323] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.083 [MASKS] A(Pass/Fail): 713/1335 | B: 658/1390 | C: 578/1470 [LOSS Ex1] A: 0.63280 | B: 0.61520 | C: 0.60724 [LOGITS Ex2 A] Mean Abs: 
2.235 | Max: 7.355 [LOSS Ex2] A: 0.11019 | B: 0.31713 | C: 0.23515 ** [JOINT LOSS] ** : 0.839239 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002400 | Grad Max: 0.044911 -> Layer: shared_layers.0.bias | Grad Mean: 0.135205 | Grad Max: 0.633643 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.005690 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007344 | Grad Max: 0.007344 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000914 | Grad Max: 0.205855 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015678 | Grad Max: 1.158123 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002977 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002946 | Grad Max: 0.027623 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000162 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000529 | Grad Max: 0.002917 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000117 | Grad Max: 0.000666 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000277 | Grad Max: 0.000809 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000212 | Grad Max: 0.000212 [GRADIENT NORM TOTAL] 3.5015 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.944 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63726795 0.362732 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 590/1026 | B: 644/1404 | C: 412/964 [LOSS Ex1] A: 0.63094 | B: 0.61567 | C: 0.60456 [LOGITS Ex2 A] Mean Abs: 2.283 | Max: 8.866 [LOSS Ex2] A: 0.11252 | B: 0.31189 | C: 0.24651 ** [JOINT LOSS] ** : 0.840693 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003181 | Grad Max: 0.079816 -> Layer: shared_layers.0.bias | Grad Mean: 0.158554 | Grad Max: 0.697370 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006488 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.001968 | Grad Max: 0.001968 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001256 | Grad Max: 0.189290 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021664 | Grad Max: 1.059678 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.004629 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008042 | Grad Max: 0.046414 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000238 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001721 | Grad Max: 0.005154 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000365 | Grad Max: 0.001774 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000219 | Grad Max: 0.000967 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003701 | Grad Max: 0.003701 [GRADIENT NORM TOTAL] 3.8261 [EPOCH SUMMARY] Train Loss: 0.8320 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8149 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8174 -> New: 0.8149) ############################## EPOCH 154/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.160 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50810784 0.49189216] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 723/1325 | B: 649/1399 | C: 610/1438 [LOSS Ex1] A: 0.63166 | B: 0.61119 | C: 0.60608 [LOGITS Ex2 A] Mean Abs: 2.233 | Max: 8.142 [LOSS Ex2] A: 0.09318 | B: 0.29514 | C: 0.23818 ** [JOINT LOSS] ** : 0.825142 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004907 | Grad Max: 0.179034 -> Layer: shared_layers.0.bias | Grad Mean: 0.307258 | Grad Max: 1.873354 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005643 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006280 | Grad Max: 0.006280 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001960 | Grad 
Max: 0.517433 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034083 | Grad Max: 2.814525 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000175 | Grad Max: 0.007787 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014477 | Grad Max: 0.114202 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000238 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002428 | Grad Max: 0.005893 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000130 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000605 | Grad Max: 0.001880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000368 | Grad Max: 0.001656 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010265 | Grad Max: 0.010265 [GRADIENT NORM TOTAL] 7.3468 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50721604 0.49278393] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 610/1246 | C: 579/1469 [LOSS Ex1] A: 0.62834 | B: 0.61536 | C: 0.61028 [LOGITS Ex2 A] Mean Abs: 2.244 | Max: 5.819 [LOSS Ex2] A: 0.09987 | B: 0.30097 | C: 0.21229 ** [JOINT LOSS] ** : 0.822372 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.130517 -> Layer: shared_layers.0.bias | Grad Mean: 0.319797 | Grad Max: 1.728895 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002263 | Grad Max: 0.006532 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008431 | Grad Max: 0.008431 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002004 | Grad Max: 0.479807 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035531 | Grad Max: 2.643390 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.007256 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014995 | Grad Max: 0.095342 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000270 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002652 | Grad Max: 0.006156 
-> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000150 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000650 | Grad Max: 0.002421 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000353 | Grad Max: 0.001693 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009251 | Grad Max: 0.009251 [GRADIENT NORM TOTAL] 7.3641 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.128 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50867635 0.49132368] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 658/1390 | C: 611/1437 [LOSS Ex1] A: 0.62730 | B: 0.61502 | C: 0.60581 [LOGITS Ex2 A] Mean Abs: 2.235 | Max: 7.679 [LOSS Ex2] A: 0.12457 | B: 0.31843 | C: 0.22714 ** [JOINT LOSS] ** : 0.839425 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006673 | Grad Max: 0.240954 -> Layer: shared_layers.0.bias | Grad Mean: 0.300468 | Grad Max: 1.259275 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006034 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000119 | Grad Max: 0.000119 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.469604 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038965 | Grad Max: 2.610384 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.008933 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020853 | Grad Max: 0.116698 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000408 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004363 | Grad Max: 0.009394 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001065 | Grad Max: 0.003459 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000573 | Grad Max: 0.001656 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015278 | Grad Max: 0.015278 [GRADIENT NORM TOTAL] 6.9682 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.934 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50122535 0.4987746 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 688/1360 | B: 645/1403 | C: 613/1435 [LOSS Ex1] A: 0.63618 | B: 0.61549 | C: 0.60391 [LOGITS Ex2 A] Mean Abs: 2.188 | Max: 7.339 [LOSS Ex2] A: 0.10779 | B: 0.31177 | C: 0.22155 ** [JOINT LOSS] ** : 0.832227 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002443 | Grad Max: 0.094536 -> Layer: shared_layers.0.bias | Grad Mean: 0.171083 | Grad Max: 1.109347 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005823 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002673 | Grad Max: 0.002673 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001268 | Grad Max: 0.329060 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022721 | Grad Max: 1.809894 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.004336 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007314 | Grad Max: 0.052977 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001365 | Grad Max: 0.004297 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000342 | Grad Max: 0.001285 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001189 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004928 | Grad Max: 0.004928 [GRADIENT NORM TOTAL] 4.7872 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.837 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5433875 0.4566126] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 688/1360 | B: 648/1400 | C: 570/1478 [LOSS Ex1] A: 0.63669 | B: 0.61101 | C: 0.61257 [LOGITS Ex2 A] Mean Abs: 
2.162 | Max: 6.213 [LOSS Ex2] A: 0.11639 | B: 0.29776 | C: 0.22956 ** [JOINT LOSS] ** : 0.834663 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009232 | Grad Max: 0.239306 -> Layer: shared_layers.0.bias | Grad Mean: 0.440128 | Grad Max: 1.798277 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005403 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006408 | Grad Max: 0.006408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003014 | Grad Max: 0.420667 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054643 | Grad Max: 2.241214 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.009176 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028420 | Grad Max: 0.127465 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000553 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005877 | Grad Max: 0.011899 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000286 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001413 | Grad Max: 0.004022 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000841 | Grad Max: 0.002615 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021207 | Grad Max: 0.021207 [GRADIENT NORM TOTAL] 9.2281 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.047 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.802384 0.197616] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 751/1297 | B: 610/1246 | C: 577/1471 [LOSS Ex1] A: 0.63022 | B: 0.61518 | C: 0.61323 [LOGITS Ex2 A] Mean Abs: 2.233 | Max: 6.377 [LOSS Ex2] A: 0.10459 | B: 0.30057 | C: 0.24287 ** [JOINT LOSS] ** : 0.835557 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005432 | Grad Max: 0.145806 -> Layer: shared_layers.0.bias | Grad Mean: 0.228222 | Grad Max: 1.186232 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002120 | Grad Max: 0.005796 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.000326 | Grad Max: 0.000326 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001680 | Grad Max: 0.450719 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030275 | Grad Max: 2.542480 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.006077 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014125 | Grad Max: 0.072628 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000332 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002929 | Grad Max: 0.007470 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000196 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000679 | Grad Max: 0.002267 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000424 | Grad Max: 0.001719 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010052 | Grad Max: 0.010052 [GRADIENT NORM TOTAL] 5.7096 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.164 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500144 0.499856] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 723/1325 | B: 658/1390 | C: 551/1497 [LOSS Ex1] A: 0.63734 | B: 0.61486 | C: 0.61804 [LOGITS Ex2 A] Mean Abs: 2.273 | Max: 6.874 [LOSS Ex2] A: 0.09932 | B: 0.32649 | C: 0.22164 ** [JOINT LOSS] ** : 0.839230 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005339 | Grad Max: 0.211509 -> Layer: shared_layers.0.bias | Grad Mean: 0.542421 | Grad Max: 2.687800 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.005227 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002820 | Grad Max: 0.002820 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003384 | Grad Max: 0.488036 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061306 | Grad Max: 2.701062 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.013952 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031693 | Grad Max: 0.181958 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad 
Max: 0.000451 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005811 | Grad Max: 0.012057 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001437 | Grad Max: 0.004092 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000671 | Grad Max: 0.002125 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020726 | Grad Max: 0.020726 [GRADIENT NORM TOTAL] 11.6337 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.821 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.732795 0.26720503] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.083 [MASKS] A(Pass/Fail): 713/1335 | B: 645/1403 | C: 574/1474 [LOSS Ex1] A: 0.63258 | B: 0.61533 | C: 0.61097 [LOGITS Ex2 A] Mean Abs: 2.254 | Max: 6.764 [LOSS Ex2] A: 0.11830 | B: 0.32298 | C: 0.22988 ** [JOINT LOSS] ** : 0.843347 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004078 | Grad Max: 0.188459 -> Layer: shared_layers.0.bias | Grad Mean: 0.588772 | Grad Max: 2.593198 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005783 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004448 | Grad Max: 0.004448 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003748 | Grad Max: 0.522929 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069835 | Grad Max: 2.850616 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000419 | Grad Max: 0.014427 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037669 | Grad Max: 0.211439 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000573 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007159 | Grad Max: 0.015290 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001724 | Grad Max: 0.005515 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000870 | Grad Max: 0.002282 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024827 | Grad Max: 0.024827 
[GRADIENT NORM TOTAL] 12.9974 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.949 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6376168 0.36238316] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 590/1026 | B: 648/1400 | C: 573/1475 [LOSS Ex1] A: 0.63073 | B: 0.61085 | C: 0.60540 [LOGITS Ex2 A] Mean Abs: 2.279 | Max: 9.904 [LOSS Ex2] A: 0.10638 | B: 0.29838 | C: 0.17797 ** [JOINT LOSS] ** : 0.809905 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002903 | Grad Max: 0.081961 -> Layer: shared_layers.0.bias | Grad Mean: 0.177503 | Grad Max: 0.815975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002248 | Grad Max: 0.005863 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006974 | Grad Max: 0.006974 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001173 | Grad Max: 0.587875 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020416 | Grad Max: 3.241303 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000078 | Grad Max: 0.004467 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005761 | Grad Max: 0.045195 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000150 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000984 | Grad Max: 0.003775 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000294 | Grad Max: 0.001346 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000533 | Grad Max: 0.001369 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004814 | Grad Max: 0.004814 [GRADIENT NORM TOTAL] 5.5212 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081106 0.49188942] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 610/1246 | C: 
556/1492 [LOSS Ex1] A: 0.63146 | B: 0.61502 | C: 0.61488 [LOGITS Ex2 A] Mean Abs: 2.241 | Max: 7.543 [LOSS Ex2] A: 0.09698 | B: 0.31807 | C: 0.22791 ** [JOINT LOSS] ** : 0.834770 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004962 | Grad Max: 0.192225 -> Layer: shared_layers.0.bias | Grad Mean: 0.539744 | Grad Max: 2.630413 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005871 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002233 | Grad Max: 0.002233 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003604 | Grad Max: 0.581572 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067173 | Grad Max: 3.268096 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000412 | Grad Max: 0.016072 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036940 | Grad Max: 0.212451 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000589 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007100 | Grad Max: 0.014675 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001711 | Grad Max: 0.005453 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000895 | Grad Max: 0.002338 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024875 | Grad Max: 0.024875 [GRADIENT NORM TOTAL] 12.3253 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.099 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5071141 0.49288583] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 658/1390 | C: 579/1469 [LOSS Ex1] A: 0.62815 | B: 0.61471 | C: 0.60569 [LOGITS Ex2 A] Mean Abs: 2.223 | Max: 6.229 [LOSS Ex2] A: 0.10157 | B: 0.33488 | C: 0.21793 ** [JOINT LOSS] ** : 0.834309 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003834 | Grad Max: 0.231708 -> Layer: shared_layers.0.bias | Grad Mean: 0.574105 | Grad Max: 3.107951 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002188 | Grad Max: 0.006314 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002247 | Grad Max: 0.002247 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003513 | Grad Max: 0.699370 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065378 | Grad Max: 3.891796 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.013023 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033134 | Grad Max: 0.180735 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000466 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006316 | Grad Max: 0.012958 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000298 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001545 | Grad Max: 0.004501 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000882 | Grad Max: 0.002227 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023916 | Grad Max: 0.023916 [GRADIENT NORM TOTAL] 13.0849 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.132 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50874317 0.49125683] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 645/1403 | C: 575/1473 [LOSS Ex1] A: 0.62711 | B: 0.61519 | C: 0.60731 [LOGITS Ex2 A] Mean Abs: 2.241 | Max: 8.080 [LOSS Ex2] A: 0.11990 | B: 0.31718 | C: 0.21825 ** [JOINT LOSS] ** : 0.834980 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005817 | Grad Max: 0.249926 -> Layer: shared_layers.0.bias | Grad Mean: 0.142376 | Grad Max: 0.598485 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.005986 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001871 | Grad Max: 0.001871 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001299 | Grad Max: 0.299434 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021087 | Grad Max: 1.663070 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003700 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.004021 | Grad Max: 0.028785 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000251 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000959 | Grad Max: 0.003900 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000095 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000227 | Grad Max: 0.000965 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000293 | Grad Max: 0.000935 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002118 | Grad Max: 0.002118 [GRADIENT NORM TOTAL] 4.1260 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.939 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50118804 0.4988119 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 688/1360 | B: 649/1399 | C: 572/1476 [LOSS Ex1] A: 0.63600 | B: 0.61071 | C: 0.61187 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 6.056 [LOSS Ex2] A: 0.10215 | B: 0.30055 | C: 0.23359 ** [JOINT LOSS] ** : 0.831623 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006685 | Grad Max: 0.220191 -> Layer: shared_layers.0.bias | Grad Mean: 0.482031 | Grad Max: 2.836068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.005711 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007922 | Grad Max: 0.007922 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003125 | Grad Max: 0.632989 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057884 | Grad Max: 3.535125 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.011127 -> Layer: exit2_layers.3.bias | Grad Mean: 0.026817 | Grad Max: 0.148970 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000437 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005238 | Grad Max: 0.011527 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000245 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001245 | Grad Max: 0.003587 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000624 | 
Grad Max: 0.001970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017532 | Grad Max: 0.017532 [GRADIENT NORM TOTAL] 11.5077 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.133 | Max: 0.841 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54326445 0.45673555] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 688/1360 | B: 610/1246 | C: 404/972 [LOSS Ex1] A: 0.63653 | B: 0.61488 | C: 0.60757 [LOGITS Ex2 A] Mean Abs: 2.215 | Max: 6.657 [LOSS Ex2] A: 0.11360 | B: 0.30189 | C: 0.20725 ** [JOINT LOSS] ** : 0.827234 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003280 | Grad Max: 0.169777 -> Layer: shared_layers.0.bias | Grad Mean: 0.290022 | Grad Max: 1.600195 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005221 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003304 | Grad Max: 0.003304 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001695 | Grad Max: 0.517957 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030945 | Grad Max: 2.861111 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.005271 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012547 | Grad Max: 0.079048 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000260 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002432 | Grad Max: 0.006344 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000632 | Grad Max: 0.002327 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000386 | Grad Max: 0.001473 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010206 | Grad Max: 0.010206 [GRADIENT NORM TOTAL] 7.1022 [EPOCH SUMMARY] Train Loss: 0.8318 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8154 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 155/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.052 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8032507 0.19674933] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.083 [MASKS] A(Pass/Fail): 751/1297 | B: 658/1390 | C: 572/1476 [LOSS Ex1] A: 0.63005 | B: 0.61458 | C: 0.61396 [LOGITS Ex2 A] Mean Abs: 2.229 | Max: 7.441 [LOSS Ex2] A: 0.10268 | B: 0.32564 | C: 0.22105 ** [JOINT LOSS] ** : 0.835985 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003638 | Grad Max: 0.087089 -> Layer: shared_layers.0.bias | Grad Mean: 0.292820 | Grad Max: 1.177554 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.006115 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000752 | Grad Max: 0.000752 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.254081 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038136 | Grad Max: 1.430156 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000237 | Grad Max: 0.010206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020997 | Grad Max: 0.128432 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000360 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004195 | Grad Max: 0.009683 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001028 | Grad Max: 0.002901 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000634 | Grad Max: 0.001986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016315 | Grad Max: 0.016315 [GRADIENT NORM TOTAL] 6.6018 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.169 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50015897 0.499841 ] | Indices: 
[1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 723/1325 | B: 645/1403 | C: 584/1464 [LOSS Ex1] A: 0.63717 | B: 0.61505 | C: 0.60820 [LOGITS Ex2 A] Mean Abs: 2.266 | Max: 5.662 [LOSS Ex2] A: 0.09818 | B: 0.31118 | C: 0.21265 ** [JOINT LOSS] ** : 0.827474 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005439 | Grad Max: 0.211160 -> Layer: shared_layers.0.bias | Grad Mean: 0.252182 | Grad Max: 1.000299 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.005357 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002137 | Grad Max: 0.002137 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001742 | Grad Max: 0.548626 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030757 | Grad Max: 3.087435 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.006139 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013917 | Grad Max: 0.075803 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000363 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002936 | Grad Max: 0.007983 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000212 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000707 | Grad Max: 0.002773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.002082 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011086 | Grad Max: 0.011086 [GRADIENT NORM TOTAL] 6.6097 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.824 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7333982 0.2666018] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.083 [MASKS] A(Pass/Fail): 713/1335 | B: 649/1399 | C: 533/1515 [LOSS Ex1] A: 0.63241 | B: 0.61058 | C: 0.60915 [LOGITS Ex2 A] Mean Abs: 2.262 | Max: 8.318 [LOSS Ex2] A: 0.13271 | B: 0.29580 | C: 0.21822 ** [JOINT LOSS] ** : 0.832953 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004575 | Grad Max: 0.182534 
-> Layer: shared_layers.0.bias | Grad Mean: 0.426985 | Grad Max: 2.410327 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002183 | Grad Max: 0.006053 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000070 | Grad Max: 0.000070 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002884 | Grad Max: 0.391845 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051904 | Grad Max: 2.172962 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000256 | Grad Max: 0.011261 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022518 | Grad Max: 0.138880 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000327 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004111 | Grad Max: 0.009119 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000196 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000994 | Grad Max: 0.002910 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000483 | Grad Max: 0.001686 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014699 | Grad Max: 0.014699 [GRADIENT NORM TOTAL] 9.6863 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.953 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63787 0.36212996] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.083 [MASKS] A(Pass/Fail): 590/1026 | B: 611/1245 | C: 596/1452 [LOSS Ex1] A: 0.63056 | B: 0.61475 | C: 0.60475 [LOGITS Ex2 A] Mean Abs: 2.327 | Max: 9.556 [LOSS Ex2] A: 0.09858 | B: 0.29763 | C: 0.21218 ** [JOINT LOSS] ** : 0.819481 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003497 | Grad Max: 0.130499 -> Layer: shared_layers.0.bias | Grad Mean: 0.280864 | Grad Max: 1.756748 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.005952 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008495 | Grad Max: 0.008495 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.297966 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036022 | Grad Max: 1.637293 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000193 | Grad Max: 0.008170 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017015 | Grad Max: 0.121691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003145 | Grad Max: 0.007183 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000768 | Grad Max: 0.002654 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000387 | Grad Max: 0.001439 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010692 | Grad Max: 0.010692 [GRADIENT NORM TOTAL] 6.7581 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.171 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081099 0.49189013] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 658/1390 | C: 580/1468 [LOSS Ex1] A: 0.63128 | B: 0.61445 | C: 0.61443 [LOGITS Ex2 A] Mean Abs: 2.281 | Max: 8.097 [LOSS Ex2] A: 0.09343 | B: 0.31853 | C: 0.22015 ** [JOINT LOSS] ** : 0.830757 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003544 | Grad Max: 0.156303 -> Layer: shared_layers.0.bias | Grad Mean: 0.390973 | Grad Max: 2.086279 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005381 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001206 | Grad Max: 0.001206 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.768567 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041881 | Grad Max: 4.264465 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000206 | Grad Max: 0.008745 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018449 | Grad Max: 0.113282 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000299 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003534 | Grad Max: 0.007820 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000220 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000872 | Grad Max: 0.002803 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000463 | Grad Max: 0.001926 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013104 | Grad Max: 0.013104 [GRADIENT NORM TOTAL] 9.7484 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.104 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50702214 0.49297792] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 719/1329 | B: 645/1403 | C: 609/1439 [LOSS Ex1] A: 0.62797 | B: 0.61493 | C: 0.60272 [LOGITS Ex2 A] Mean Abs: 2.245 | Max: 6.932 [LOSS Ex2] A: 0.09959 | B: 0.31703 | C: 0.20947 ** [JOINT LOSS] ** : 0.823902 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003249 | Grad Max: 0.147978 -> Layer: shared_layers.0.bias | Grad Mean: 0.398789 | Grad Max: 1.955774 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002251 | Grad Max: 0.006003 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002188 | Grad Max: 0.002188 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002316 | Grad Max: 0.758862 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042626 | Grad Max: 4.192188 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000200 | Grad Max: 0.007596 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018011 | Grad Max: 0.107687 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003332 | Grad Max: 0.007098 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000819 | Grad Max: 0.002584 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000466 | Grad Max: 0.001815 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012975 | Grad Max: 0.012975 [GRADIENT NORM TOTAL] 9.5252 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.143 | Max: 1.137 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50884104 0.491159 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 717/1331 | B: 651/1397 | C: 594/1454 [LOSS Ex1] A: 0.62693 | B: 0.61045 | C: 0.61067 [LOGITS Ex2 A] Mean Abs: 2.237 | Max: 6.742 [LOSS Ex2] A: 0.11114 | B: 0.29704 | C: 0.22446 ** [JOINT LOSS] ** : 0.826895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005323 | Grad Max: 0.227302 -> Layer: shared_layers.0.bias | Grad Mean: 0.171922 | Grad Max: 0.801632 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.006344 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002723 | Grad Max: 0.002723 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001327 | Grad Max: 0.533285 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022401 | Grad Max: 2.949156 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000126 | Grad Max: 0.004143 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010300 | Grad Max: 0.050995 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000266 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002312 | Grad Max: 0.005969 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000142 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000549 | Grad Max: 0.001874 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001374 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006607 | Grad Max: 0.006607 [GRADIENT NORM TOTAL] 4.8634 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.943 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50117105 0.49882892] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 688/1360 | B: 611/1245 | C: 597/1451 [LOSS Ex1] A: 0.63582 | B: 0.61462 | C: 0.60848 [LOGITS Ex2 A] Mean Abs: 2.217 | Max: 6.255 [LOSS Ex2] A: 0.10551 | B: 0.29877 | C: 0.21173 ** [JOINT LOSS] ** : 0.824975 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002498 | Grad Max: 0.068261 -> Layer: shared_layers.0.bias | Grad Mean: 0.080727 | Grad Max: 0.460734 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001818 | Grad Max: 0.001818 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000695 | Grad Max: 0.144976 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011767 | Grad Max: 0.811251 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002153 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002505 | Grad Max: 0.020263 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000118 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000409 | Grad Max: 0.002541 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000737 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000371 | Grad Max: 0.001059 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000688 | Grad Max: 0.000688 [GRADIENT NORM TOTAL] 2.4131 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.844 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5432385 0.45676142] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 689/1359 | B: 658/1390 | C: 589/1459 [LOSS Ex1] A: 0.63635 | B: 0.61433 | C: 0.60903 [LOGITS Ex2 A] Mean Abs: 2.204 | Max: 6.409 [LOSS Ex2] A: 0.10842 | B: 0.30872 | C: 0.20257 ** [JOINT LOSS] ** : 0.826473 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001711 | Grad Max: 0.032060 -> Layer: shared_layers.0.bias | Grad Mean: 0.053721 | Grad Max: 0.288284 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005680 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006946 | Grad Max: 0.006946 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000616 | Grad Max: 
0.149468 -> Layer: exit2_layers.0.bias | Grad Mean: 0.010676 | Grad Max: 0.827307 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002571 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002525 | Grad Max: 0.027773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000436 | Grad Max: 0.002794 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000070 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000114 | Grad Max: 0.000796 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001212 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001647 | Grad Max: 0.001647 [GRADIENT NORM TOTAL] 2.1344 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.056 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8041074 0.19589256] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.083 [MASKS] A(Pass/Fail): 752/1296 | B: 645/1403 | C: 592/1456 [LOSS Ex1] A: 0.62986 | B: 0.61478 | C: 0.61239 [LOGITS Ex2 A] Mean Abs: 2.268 | Max: 7.599 [LOSS Ex2] A: 0.10032 | B: 0.31902 | C: 0.23545 ** [JOINT LOSS] ** : 0.837274 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003864 | Grad Max: 0.167335 -> Layer: shared_layers.0.bias | Grad Mean: 0.406388 | Grad Max: 2.038413 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006002 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004451 | Grad Max: 0.004451 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002716 | Grad Max: 0.600284 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049787 | Grad Max: 3.356731 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.008543 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022564 | Grad Max: 0.115386 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000380 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004377 | Grad Max: 0.009684 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000229 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001042 | Grad Max: 0.003499 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000490 | Grad Max: 0.001596 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014230 | Grad Max: 0.014230 [GRADIENT NORM TOTAL] 10.2043 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 1.174 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50019974 0.49980026] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 724/1324 | B: 651/1397 | C: 573/1475 [LOSS Ex1] A: 0.63699 | B: 0.61029 | C: 0.60703 [LOGITS Ex2 A] Mean Abs: 2.241 | Max: 5.846 [LOSS Ex2] A: 0.09568 | B: 0.29660 | C: 0.19256 ** [JOINT LOSS] ** : 0.813047 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005661 | Grad Max: 0.183811 -> Layer: shared_layers.0.bias | Grad Mean: 0.184458 | Grad Max: 0.916484 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002216 | Grad Max: 0.005442 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006021 | Grad Max: 0.006021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.421721 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021870 | Grad Max: 2.341128 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000073 | Grad Max: 0.003779 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004013 | Grad Max: 0.038516 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000162 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000515 | Grad Max: 0.003445 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000170 | Grad Max: 0.001172 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000470 | Grad Max: 0.001248 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002367 | Grad Max: 0.002367 [GRADIENT NORM TOTAL] 4.8372 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.827 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7340357 0.26596433] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.083 [MASKS] A(Pass/Fail): 713/1335 | B: 611/1245 | C: 568/1480 [LOSS Ex1] A: 0.63221 | B: 0.61445 | C: 0.61287 [LOGITS Ex2 A] Mean Abs: 2.213 | Max: 6.403 [LOSS Ex2] A: 0.12734 | B: 0.30646 | C: 0.23252 ** [JOINT LOSS] ** : 0.841952 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009623 | Grad Max: 0.285598 -> Layer: shared_layers.0.bias | Grad Mean: 0.483274 | Grad Max: 2.210505 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002060 | Grad Max: 0.005524 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002583 | Grad Max: 0.002583 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003138 | Grad Max: 0.588366 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057397 | Grad Max: 3.289949 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.011082 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030860 | Grad Max: 0.152320 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000561 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006254 | Grad Max: 0.012745 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000319 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001554 | Grad Max: 0.004715 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000950 | Grad Max: 0.002473 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024593 | Grad Max: 0.024593 [GRADIENT NORM TOTAL] 10.2337 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.957 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6382151 0.36178482] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 590/1026 | B: 658/1390 | C: 562/1486 [LOSS Ex1] A: 0.63035 | B: 0.61416 | C: 0.60650 [LOGITS Ex2 A] Mean Abs: 
2.303 | Max: 8.997 [LOSS Ex2] A: 0.09675 | B: 0.31804 | C: 0.21903 ** [JOINT LOSS] ** : 0.828275 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005641 | Grad Max: 0.155749 -> Layer: shared_layers.0.bias | Grad Mean: 0.334207 | Grad Max: 1.974810 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002241 | Grad Max: 0.007187 -> Layer: exit1_layers.0.bias | Grad Mean: 0.017303 | Grad Max: 0.017303 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.651462 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040975 | Grad Max: 3.647148 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000221 | Grad Max: 0.006781 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019164 | Grad Max: 0.106671 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000388 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003853 | Grad Max: 0.008399 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000979 | Grad Max: 0.002687 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000621 | Grad Max: 0.002183 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016178 | Grad Max: 0.016178 [GRADIENT NORM TOTAL] 8.6096 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.176 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081617 0.49183828] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 646/1402 | C: 413/963 [LOSS Ex1] A: 0.63108 | B: 0.61461 | C: 0.60652 [LOGITS Ex2 A] Mean Abs: 2.317 | Max: 6.716 [LOSS Ex2] A: 0.09537 | B: 0.31761 | C: 0.24345 ** [JOINT LOSS] ** : 0.836214 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005853 | Grad Max: 0.182984 -> Layer: shared_layers.0.bias | Grad Mean: 0.558734 | Grad Max: 2.600561 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005471 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.005405 | Grad Max: 0.005405 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003612 | Grad Max: 0.601803 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066629 | Grad Max: 3.333735 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000408 | Grad Max: 0.012915 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036443 | Grad Max: 0.192689 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000534 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007002 | Grad Max: 0.015272 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000327 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001633 | Grad Max: 0.005284 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000784 | Grad Max: 0.002151 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021495 | Grad Max: 0.021495 [GRADIENT NORM TOTAL] 12.1369 [EPOCH SUMMARY] Train Loss: 0.8290 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8149 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 156/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.109 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50688756 0.49311242] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 719/1329 | B: 652/1396 | C: 556/1492 [LOSS Ex1] A: 0.62776 | B: 0.61013 | C: 0.61131 [LOGITS Ex2 A] Mean Abs: 2.332 | Max: 5.771 [LOSS Ex2] A: 0.11014 | B: 0.29576 | C: 0.23358 ** [JOINT LOSS] ** : 0.829560 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008059 | Grad Max: 0.211411 -> Layer: shared_layers.0.bias | Grad Mean: 0.600612 | Grad Max: 2.759404 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002257 | Grad Max: 0.005898 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004734 | Grad Max: 0.004734 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004055 | Grad Max: 0.538892 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.074584 | Grad Max: 2.994457 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000469 | Grad Max: 0.014972 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041661 | Grad Max: 0.230978 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000680 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008280 | Grad Max: 0.017447 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000391 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001999 | Grad Max: 0.006133 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001055 | Grad Max: 0.002552 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028719 | Grad Max: 0.028719 [GRADIENT NORM TOTAL] 13.0186 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50898594 0.49101412] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 718/1330 | B: 613/1243 | C: 619/1429 [LOSS Ex1] A: 0.62671 | B: 0.61429 | C: 0.60355 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 8.543 [LOSS Ex2] A: 0.11378 | B: 0.30129 | C: 0.20796 ** [JOINT LOSS] ** : 0.822529 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005693 | Grad Max: 0.234105 -> Layer: shared_layers.0.bias | Grad Mean: 0.261290 | Grad Max: 1.575961 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002279 | Grad Max: 0.006474 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002572 | Grad Max: 0.002572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001862 | Grad Max: 0.532848 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031199 | Grad Max: 2.982148 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.005720 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006833 | Grad Max: 0.068661 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000783 | Grad Max: 0.003726 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000196 | Grad Max: 0.001046 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.000882 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003044 | Grad Max: 0.003044 [GRADIENT NORM TOTAL] 6.6072 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.948 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50120294 0.498797 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 688/1360 | B: 658/1390 | C: 565/1483 [LOSS Ex1] A: 0.63561 | B: 0.61401 | C: 0.60895 [LOGITS Ex2 A] Mean Abs: 2.216 | Max: 5.806 [LOSS Ex2] A: 0.09981 | B: 0.32937 | C: 0.21710 ** [JOINT LOSS] ** : 0.834953 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005126 | Grad Max: 0.251061 -> Layer: shared_layers.0.bias | Grad Mean: 0.650604 | Grad Max: 3.430507 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005659 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006997 | Grad Max: 0.006997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004276 | Grad Max: 0.868649 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080192 | Grad Max: 4.801690 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000449 | Grad Max: 0.016006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040638 | Grad Max: 0.223652 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000536 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007676 | Grad Max: 0.015107 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000358 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001873 | Grad Max: 0.005591 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001035 | Grad Max: 0.002805 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028613 | Grad Max: 0.028613 [GRADIENT NORM TOTAL] 15.6169 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.847 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431735 0.45682657] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 689/1359 | B: 646/1402 | C: 600/1448 [LOSS Ex1] A: 0.63616 | B: 0.61447 | C: 0.60528 [LOGITS Ex2 A] Mean Abs: 2.239 | Max: 6.160 [LOSS Ex2] A: 0.10985 | B: 0.31334 | C: 0.21601 ** [JOINT LOSS] ** : 0.831705 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003461 | Grad Max: 0.073710 -> Layer: shared_layers.0.bias | Grad Mean: 0.259709 | Grad Max: 1.091612 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.005459 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004997 | Grad Max: 0.004997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.658223 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031618 | Grad Max: 3.658653 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000173 | Grad Max: 0.005592 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015073 | Grad Max: 0.081881 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000272 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002934 | Grad Max: 0.006887 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000172 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000706 | Grad Max: 0.002315 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000386 | Grad Max: 0.001392 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010412 | Grad Max: 0.010412 [GRADIENT NORM TOTAL] 6.7809 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.061 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8050433 0.19495672] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 752/1296 | B: 653/1395 | C: 604/1444 [LOSS Ex1] A: 0.62967 | B: 0.60999 | C: 0.60674 [LOGITS Ex2 A] Mean Abs: 2.303 | Max: 7.012 
[LOSS Ex2] A: 0.09769 | B: 0.29205 | C: 0.19993 ** [JOINT LOSS] ** : 0.812024 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007354 | Grad Max: 0.218204 -> Layer: shared_layers.0.bias | Grad Mean: 0.543717 | Grad Max: 2.991340 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.005911 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003282 | Grad Max: 0.003282 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003494 | Grad Max: 0.578011 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064576 | Grad Max: 3.244424 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.012707 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032545 | Grad Max: 0.176336 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000501 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006366 | Grad Max: 0.013386 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001520 | Grad Max: 0.004555 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000787 | Grad Max: 0.002397 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021948 | Grad Max: 0.021948 [GRADIENT NORM TOTAL] 12.3653 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002573 0.49974266] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.082 [MASKS] A(Pass/Fail): 724/1324 | B: 613/1243 | C: 602/1446 [LOSS Ex1] A: 0.63681 | B: 0.61415 | C: 0.60764 [LOGITS Ex2 A] Mean Abs: 2.327 | Max: 6.385 [LOSS Ex2] A: 0.10124 | B: 0.29914 | C: 0.21568 ** [JOINT LOSS] ** : 0.824888 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005474 | Grad Max: 0.218200 -> Layer: shared_layers.0.bias | Grad Mean: 0.535448 | Grad Max: 2.851196 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005341 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000926 | Grad Max: 
0.000926 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003438 | Grad Max: 0.676840 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063721 | Grad Max: 3.781099 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000358 | Grad Max: 0.012076 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031949 | Grad Max: 0.174672 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000431 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005978 | Grad Max: 0.012167 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000301 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.004053 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000720 | Grad Max: 0.002171 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020681 | Grad Max: 0.020681 [GRADIENT NORM TOTAL] 12.4996 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.831 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73459584 0.2654041 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 713/1335 | B: 659/1389 | C: 568/1480 [LOSS Ex1] A: 0.63203 | B: 0.61388 | C: 0.61028 [LOGITS Ex2 A] Mean Abs: 2.250 | Max: 6.840 [LOSS Ex2] A: 0.10958 | B: 0.32030 | C: 0.21198 ** [JOINT LOSS] ** : 0.832684 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003921 | Grad Max: 0.173641 -> Layer: shared_layers.0.bias | Grad Mean: 0.135613 | Grad Max: 1.425563 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006072 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001867 | Grad Max: 0.001867 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001104 | Grad Max: 0.285587 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018190 | Grad Max: 1.577303 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000059 | Grad Max: 0.003269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002647 | Grad Max: 0.024926 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000200 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.000356 | Grad Max: 0.002118 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000098 | Grad Max: 0.000500 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000286 | Grad Max: 0.000986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001312 | Grad Max: 0.001312 [GRADIENT NORM TOTAL] 3.9748 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6384724 0.36152765] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 590/1026 | B: 646/1402 | C: 570/1478 [LOSS Ex1] A: 0.63017 | B: 0.61434 | C: 0.60803 [LOGITS Ex2 A] Mean Abs: 2.293 | Max: 7.998 [LOSS Ex2] A: 0.10680 | B: 0.32315 | C: 0.22808 ** [JOINT LOSS] ** : 0.836857 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006226 | Grad Max: 0.178059 -> Layer: shared_layers.0.bias | Grad Mean: 0.523794 | Grad Max: 2.258018 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006319 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011600 | Grad Max: 0.011600 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003439 | Grad Max: 0.379696 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063211 | Grad Max: 2.142601 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.015163 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036239 | Grad Max: 0.199207 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000648 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007199 | Grad Max: 0.014787 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000326 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001805 | Grad Max: 0.004996 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001037 | Grad Max: 0.002579 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028568 | Grad Max: 0.028568 [GRADIENT NORM 
TOTAL] 10.8873 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.180 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081915 0.4918085] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 653/1395 | C: 595/1453 [LOSS Ex1] A: 0.63090 | B: 0.60986 | C: 0.61104 [LOGITS Ex2 A] Mean Abs: 2.268 | Max: 10.175 [LOSS Ex2] A: 0.09636 | B: 0.29571 | C: 0.21892 ** [JOINT LOSS] ** : 0.820934 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002408 | Grad Max: 0.090016 -> Layer: shared_layers.0.bias | Grad Mean: 0.245006 | Grad Max: 1.189641 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.005180 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000566 | Grad Max: 0.000566 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001543 | Grad Max: 0.368607 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028031 | Grad Max: 2.042300 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006671 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015296 | Grad Max: 0.090306 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002854 | Grad Max: 0.007775 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000178 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000675 | Grad Max: 0.002484 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000388 | Grad Max: 0.001676 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009969 | Grad Max: 0.009969 [GRADIENT NORM TOTAL] 5.4795 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.113 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5068304 0.4931696] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 720/1328 | B: 613/1243 | C: 584/1464 [LOSS Ex1] A: 
0.62758 | B: 0.61402 | C: 0.60386 [LOGITS Ex2 A] Mean Abs: 2.316 | Max: 6.349 [LOSS Ex2] A: 0.11378 | B: 0.30906 | C: 0.21510 ** [JOINT LOSS] ** : 0.827801 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008334 | Grad Max: 0.219607 -> Layer: shared_layers.0.bias | Grad Mean: 0.561810 | Grad Max: 2.677314 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002308 | Grad Max: 0.006780 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008213 | Grad Max: 0.008213 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003760 | Grad Max: 0.477254 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069635 | Grad Max: 2.656878 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000411 | Grad Max: 0.014030 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036673 | Grad Max: 0.200695 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000539 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007236 | Grad Max: 0.015377 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000348 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001763 | Grad Max: 0.005096 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000955 | Grad Max: 0.002334 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026556 | Grad Max: 0.026556 [GRADIENT NORM TOTAL] 12.2492 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.147 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509067 0.49093294] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 718/1330 | B: 659/1389 | C: 613/1435 [LOSS Ex1] A: 0.62654 | B: 0.61376 | C: 0.60871 [LOGITS Ex2 A] Mean Abs: 2.290 | Max: 8.077 [LOSS Ex2] A: 0.12157 | B: 0.32530 | C: 0.22157 ** [JOINT LOSS] ** : 0.839150 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011774 | Grad Max: 0.323701 -> Layer: shared_layers.0.bias | Grad Mean: 0.655926 | Grad Max: 2.661777 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad 
Max: 0.006501 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000142 | Grad Max: 0.000142 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004558 | Grad Max: 0.545714 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083343 | Grad Max: 3.035133 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000515 | Grad Max: 0.015925 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045059 | Grad Max: 0.232359 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000776 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009091 | Grad Max: 0.019596 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000467 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002190 | Grad Max: 0.007352 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001139 | Grad Max: 0.002879 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031087 | Grad Max: 0.031087 [GRADIENT NORM TOTAL] 13.8341 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.951 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5011738 0.49882624] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 688/1360 | B: 646/1402 | C: 601/1447 [LOSS Ex1] A: 0.63545 | B: 0.61422 | C: 0.60891 [LOGITS Ex2 A] Mean Abs: 2.216 | Max: 6.282 [LOSS Ex2] A: 0.09948 | B: 0.30808 | C: 0.22110 ** [JOINT LOSS] ** : 0.829082 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002644 | Grad Max: 0.082241 -> Layer: shared_layers.0.bias | Grad Mean: 0.101058 | Grad Max: 0.623551 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005238 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004219 | Grad Max: 0.004219 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000850 | Grad Max: 0.182180 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014306 | Grad Max: 0.989350 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.003951 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003280 | Grad Max: 0.032803 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000154 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000482 | Grad Max: 0.002517 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000062 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000605 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000372 | Grad Max: 0.001099 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002029 | Grad Max: 0.002029 [GRADIENT NORM TOTAL] 3.0079 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.134 | Max: 0.850 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431843 0.45681572] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.079 [MASKS] A(Pass/Fail): 689/1359 | B: 653/1395 | C: 550/1498 [LOSS Ex1] A: 0.63600 | B: 0.60973 | C: 0.61148 [LOGITS Ex2 A] Mean Abs: 2.186 | Max: 5.419 [LOSS Ex2] A: 0.11032 | B: 0.29243 | C: 0.23453 ** [JOINT LOSS] ** : 0.831497 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005720 | Grad Max: 0.212417 -> Layer: shared_layers.0.bias | Grad Mean: 0.525648 | Grad Max: 2.686779 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.006293 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013398 | Grad Max: 0.013398 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003164 | Grad Max: 0.862336 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058702 | Grad Max: 4.769650 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.011005 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028937 | Grad Max: 0.161332 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000430 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005703 | Grad Max: 0.011032 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000313 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001421 | Grad Max: 0.004412 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000796 | Grad Max: 0.002438 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.021910 | Grad Max: 0.021910 [GRADIENT NORM TOTAL] 12.5446 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.065 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.80583656 0.19416343] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 752/1296 | B: 613/1243 | C: 357/1019 [LOSS Ex1] A: 0.62951 | B: 0.61390 | C: 0.61776 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 7.805 [LOSS Ex2] A: 0.09421 | B: 0.29981 | C: 0.26375 ** [JOINT LOSS] ** : 0.839641 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003555 | Grad Max: 0.187002 -> Layer: shared_layers.0.bias | Grad Mean: 0.379988 | Grad Max: 2.088233 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.005275 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001313 | Grad Max: 0.001313 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002154 | Grad Max: 0.634654 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039515 | Grad Max: 3.491164 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000215 | Grad Max: 0.009975 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019837 | Grad Max: 0.117367 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000027 | Grad Max: 0.000324 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003896 | Grad Max: 0.008542 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000211 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001011 | Grad Max: 0.003058 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000572 | Grad Max: 0.001692 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016790 | Grad Max: 0.016790 [GRADIENT NORM TOTAL] 8.6935 [EPOCH SUMMARY] Train Loss: 0.8295 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8127 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8149 -> New: 0.8127) ############################## EPOCH 157/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002708 0.4997292] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 659/1389 | C: 586/1462 [LOSS Ex1] A: 0.63665 | B: 0.61364 | C: 0.60593 [LOGITS Ex2 A] Mean Abs: 2.286 | Max: 7.116 [LOSS Ex2] A: 0.08777 | B: 0.31969 | C: 0.21251 ** [JOINT LOSS] ** : 0.825399 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003548 | Grad Max: 0.140361 -> Layer: shared_layers.0.bias | Grad Mean: 0.383234 | Grad Max: 1.779468 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005256 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002983 | Grad Max: 0.002983 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002660 | Grad Max: 0.368186 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048400 | Grad Max: 2.041661 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000275 | Grad Max: 0.010165 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024782 | Grad Max: 0.141471 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000381 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004527 | Grad Max: 0.009597 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000210 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001101 | Grad Max: 0.003207 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000515 | Grad Max: 0.001836 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015978 | Grad Max: 0.015978 [GRADIENT NORM TOTAL] 8.6718 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.834 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73514456 0.26485547] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 714/1334 | B: 646/1402 | C: 584/1464 [LOSS Ex1] A: 0.63186 | B: 0.61409 | C: 0.60506 [LOGITS Ex2 A] Mean Abs: 2.243 | Max: 6.947 [LOSS Ex2] A: 0.12214 | B: 0.31729 | C: 0.22134 ** [JOINT LOSS] ** : 0.837260 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004434 | Grad Max: 0.172871 -> Layer: shared_layers.0.bias | Grad Mean: 0.362127 | Grad Max: 2.199117 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006012 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010368 | Grad Max: 0.010368 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002413 | Grad Max: 0.432402 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043339 | Grad Max: 2.403100 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000208 | Grad Max: 0.010333 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018629 | Grad Max: 0.114446 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000283 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003411 | Grad Max: 0.007866 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000147 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000812 | Grad Max: 0.002231 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001445 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011374 | Grad Max: 0.011374 [GRADIENT NORM TOTAL] 8.5838 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.965 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6387481 0.3612519] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 590/1026 | B: 653/1395 | C: 558/1490 [LOSS Ex1] A: 0.63000 | B: 0.60961 | C: 0.61023 [LOGITS Ex2 A] Mean Abs: 2.246 | Max: 8.015 [LOSS Ex2] A: 0.10292 | B: 0.29444 | C: 0.23253 ** [JOINT LOSS] ** : 0.826576 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004841 | Grad Max: 0.144939 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.175856 | Grad Max: 1.195758 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005732 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008023 | Grad Max: 0.008023 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001320 | Grad Max: 0.168387 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023308 | Grad Max: 0.929998 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000137 | Grad Max: 0.006574 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011883 | Grad Max: 0.073321 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000277 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002488 | Grad Max: 0.006227 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000174 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000636 | Grad Max: 0.002288 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.001633 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010931 | Grad Max: 0.010931 [GRADIENT NORM TOTAL] 4.0239 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.185 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081894 0.49181062] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 613/1243 | C: 596/1452 [LOSS Ex1] A: 0.63073 | B: 0.61377 | C: 0.60632 [LOGITS Ex2 A] Mean Abs: 2.250 | Max: 8.729 [LOSS Ex2] A: 0.09933 | B: 0.30300 | C: 0.21349 ** [JOINT LOSS] ** : 0.822211 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002797 | Grad Max: 0.094142 -> Layer: shared_layers.0.bias | Grad Mean: 0.102614 | Grad Max: 0.370398 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.006106 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006131 | Grad Max: 0.006131 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000713 | Grad Max: 0.190896 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011995 | Grad Max: 1.049854 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 
| Grad Max: 0.002305 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002162 | Grad Max: 0.019623 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000132 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000383 | Grad Max: 0.002813 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000066 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000552 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000372 | Grad Max: 0.001076 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000884 | Grad Max: 0.000884 [GRADIENT NORM TOTAL] 2.7491 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.117 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50675005 0.4932499 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 720/1328 | B: 659/1389 | C: 584/1464 [LOSS Ex1] A: 0.62740 | B: 0.61351 | C: 0.60871 [LOGITS Ex2 A] Mean Abs: 2.276 | Max: 6.059 [LOSS Ex2] A: 0.10216 | B: 0.31654 | C: 0.20476 ** [JOINT LOSS] ** : 0.824360 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003986 | Grad Max: 0.119048 -> Layer: shared_layers.0.bias | Grad Mean: 0.169245 | Grad Max: 1.186606 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.005931 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000338 | Grad Max: 0.000338 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001215 | Grad Max: 0.335827 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021688 | Grad Max: 1.839101 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004651 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010833 | Grad Max: 0.064112 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000256 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002211 | Grad Max: 0.005904 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000121 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000536 | Grad Max: 
0.001595 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000364 | Grad Max: 0.001300 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007469 | Grad Max: 0.007469 [GRADIENT NORM TOTAL] 4.2067 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.151 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5091447 0.49085534] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 721/1327 | B: 646/1402 | C: 643/1405 [LOSS Ex1] A: 0.62635 | B: 0.61395 | C: 0.60533 [LOGITS Ex2 A] Mean Abs: 2.218 | Max: 7.452 [LOSS Ex2] A: 0.11976 | B: 0.31604 | C: 0.21367 ** [JOINT LOSS] ** : 0.831704 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003641 | Grad Max: 0.125250 -> Layer: shared_layers.0.bias | Grad Mean: 0.282281 | Grad Max: 1.681771 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006529 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001399 | Grad Max: 0.001399 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001846 | Grad Max: 0.514216 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033046 | Grad Max: 2.821452 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.006820 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013512 | Grad Max: 0.096123 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000218 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002339 | Grad Max: 0.006260 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000130 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000555 | Grad Max: 0.001826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001320 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008709 | Grad Max: 0.008709 [GRADIENT NORM TOTAL] 7.0020 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.955 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.50114894 0.4988511 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.561 | Std: 0.083 [MASKS] A(Pass/Fail): 688/1360 | B: 653/1395 | C: 564/1484 [LOSS Ex1] A: 0.63526 | B: 0.60945 | C: 0.60839 [LOGITS Ex2 A] Mean Abs: 2.197 | Max: 5.816 [LOSS Ex2] A: 0.09685 | B: 0.28676 | C: 0.22880 ** [JOINT LOSS] ** : 0.821838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003316 | Grad Max: 0.091841 -> Layer: shared_layers.0.bias | Grad Mean: 0.169468 | Grad Max: 0.851953 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005509 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001600 | Grad Max: 0.001600 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001138 | Grad Max: 0.443816 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019673 | Grad Max: 2.466603 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000072 | Grad Max: 0.004295 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005639 | Grad Max: 0.046831 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000181 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000935 | Grad Max: 0.004192 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000078 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000211 | Grad Max: 0.000829 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000423 | Grad Max: 0.001189 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003008 | Grad Max: 0.003008 [GRADIENT NORM TOTAL] 4.7809 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.854 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431174 0.45688257] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 613/1243 | C: 598/1450 [LOSS Ex1] A: 0.63582 | B: 0.61360 | C: 0.60975 [LOGITS Ex2 A] Mean Abs: 2.213 | Max: 6.007 [LOSS Ex2] A: 0.10423 | B: 0.29529 | C: 0.21326 ** [JOINT LOSS] ** : 0.823986 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | 
Grad Mean: 0.003320 | Grad Max: 0.118124 -> Layer: shared_layers.0.bias | Grad Mean: 0.238981 | Grad Max: 1.072875 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002876 | Grad Max: 0.002876 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.293769 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026104 | Grad Max: 1.603777 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000132 | Grad Max: 0.006414 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011597 | Grad Max: 0.074806 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000240 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002261 | Grad Max: 0.006043 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000131 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000596 | Grad Max: 0.001918 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000399 | Grad Max: 0.001551 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009142 | Grad Max: 0.009142 [GRADIENT NORM TOTAL] 5.4004 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.069 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.806801 0.19319896] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 752/1296 | B: 659/1389 | C: 572/1476 [LOSS Ex1] A: 0.62932 | B: 0.61334 | C: 0.61028 [LOGITS Ex2 A] Mean Abs: 2.226 | Max: 6.437 [LOSS Ex2] A: 0.10542 | B: 0.32074 | C: 0.21449 ** [JOINT LOSS] ** : 0.831196 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002352 | Grad Max: 0.069171 -> Layer: shared_layers.0.bias | Grad Mean: 0.143298 | Grad Max: 0.944270 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005597 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000119 | Grad Max: 0.000119 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000951 | Grad Max: 0.234409 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.016712 | Grad Max: 1.302522 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000063 | Grad Max: 0.004535 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005027 | Grad Max: 0.042310 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000175 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000908 | Grad Max: 0.003686 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000202 | Grad Max: 0.000907 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.000969 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002029 | Grad Max: 0.002029 [GRADIENT NORM TOTAL] 3.5866 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.188 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50026906 0.4997309 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 646/1402 | C: 565/1483 [LOSS Ex1] A: 0.63647 | B: 0.61378 | C: 0.60658 [LOGITS Ex2 A] Mean Abs: 2.221 | Max: 6.186 [LOSS Ex2] A: 0.08588 | B: 0.31946 | C: 0.21392 ** [JOINT LOSS] ** : 0.825362 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005208 | Grad Max: 0.135971 -> Layer: shared_layers.0.bias | Grad Mean: 0.387676 | Grad Max: 1.883190 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.005237 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000783 | Grad Max: 0.000783 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002561 | Grad Max: 0.765227 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046841 | Grad Max: 4.253297 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000269 | Grad Max: 0.008995 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023945 | Grad Max: 0.124429 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000035 | Grad Max: 0.000449 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004771 | Grad Max: 0.011682 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | 
Grad Max: 0.000263 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001133 | Grad Max: 0.003856 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000626 | Grad Max: 0.002614 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016978 | Grad Max: 0.016978 [GRADIENT NORM TOTAL] 9.6201 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.838 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.735948 0.26405194] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 714/1334 | B: 653/1395 | C: 574/1474 [LOSS Ex1] A: 0.63166 | B: 0.60928 | C: 0.60731 [LOGITS Ex2 A] Mean Abs: 2.215 | Max: 7.002 [LOSS Ex2] A: 0.11554 | B: 0.29528 | C: 0.23121 ** [JOINT LOSS] ** : 0.830090 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006661 | Grad Max: 0.175301 -> Layer: shared_layers.0.bias | Grad Mean: 0.302390 | Grad Max: 1.223943 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002142 | Grad Max: 0.005913 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004426 | Grad Max: 0.004426 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001957 | Grad Max: 0.724845 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035436 | Grad Max: 4.006245 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000232 | Grad Max: 0.008081 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019846 | Grad Max: 0.103750 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000393 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003976 | Grad Max: 0.009347 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000238 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000956 | Grad Max: 0.003162 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.002047 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014270 | Grad Max: 0.014270 [GRADIENT NORM TOTAL] 7.3235 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.970 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6391472 0.3608528] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 590/1026 | B: 613/1243 | C: 612/1436 [LOSS Ex1] A: 0.62979 | B: 0.61342 | C: 0.60841 [LOGITS Ex2 A] Mean Abs: 2.299 | Max: 11.122 [LOSS Ex2] A: 0.10873 | B: 0.29584 | C: 0.21830 ** [JOINT LOSS] ** : 0.824829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003752 | Grad Max: 0.221480 -> Layer: shared_layers.0.bias | Grad Mean: 0.444011 | Grad Max: 2.916276 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.006012 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002571 | Grad Max: 0.002571 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002843 | Grad Max: 0.493130 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052173 | Grad Max: 2.703733 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000282 | Grad Max: 0.011074 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025832 | Grad Max: 0.148893 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000382 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004822 | Grad Max: 0.010703 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001160 | Grad Max: 0.003139 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000538 | Grad Max: 0.001970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016630 | Grad Max: 0.016630 [GRADIENT NORM TOTAL] 10.3892 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.190 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50814945 0.49185058] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 659/1389 | C: 593/1455 [LOSS Ex1] A: 0.63052 | B: 0.61318 | C: 0.61043 [LOGITS Ex2 A] Mean Abs: 2.296 | Max: 10.431 [LOSS Ex2] A: 0.09531 | B: 0.31850 | C: 
0.22236 ** [JOINT LOSS] ** : 0.830102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004413 | Grad Max: 0.213261 -> Layer: shared_layers.0.bias | Grad Mean: 0.551957 | Grad Max: 2.872541 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005511 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001245 | Grad Max: 0.001245 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003729 | Grad Max: 0.557624 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069232 | Grad Max: 3.092756 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.014267 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036689 | Grad Max: 0.206255 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000551 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007050 | Grad Max: 0.014467 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000311 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001708 | Grad Max: 0.004858 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000793 | Grad Max: 0.002264 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024078 | Grad Max: 0.024078 [GRADIENT NORM TOTAL] 12.7830 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.123 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5067339 0.4932661] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 720/1328 | B: 646/1402 | C: 379/997 [LOSS Ex1] A: 0.62719 | B: 0.61362 | C: 0.61245 [LOGITS Ex2 A] Mean Abs: 2.247 | Max: 6.263 [LOSS Ex2] A: 0.10232 | B: 0.30991 | C: 0.22495 ** [JOINT LOSS] ** : 0.830150 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002447 | Grad Max: 0.045752 -> Layer: shared_layers.0.bias | Grad Mean: 0.134412 | Grad Max: 0.557344 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005698 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002716 | Grad Max: 0.002716 -> Layer: exit2_layers.0.weight 
| Grad Mean: 0.000914 | Grad Max: 0.413251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016031 | Grad Max: 2.313596 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003904 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002599 | Grad Max: 0.023597 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000130 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000451 | Grad Max: 0.002933 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000075 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000117 | Grad Max: 0.000634 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000353 | Grad Max: 0.001229 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001782 | Grad Max: 0.001782 [GRADIENT NORM TOTAL] 4.1409 [EPOCH SUMMARY] Train Loss: 0.8275 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8114 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8127 -> New: 0.8114) ############################## EPOCH 158/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.157 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50914294 0.4908571 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 721/1327 | B: 653/1395 | C: 588/1460 [LOSS Ex1] A: 0.62615 | B: 0.60912 | C: 0.60741 [LOGITS Ex2 A] Mean Abs: 2.211 | Max: 7.056 [LOSS Ex2] A: 0.10545 | B: 0.29204 | C: 0.22381 ** [JOINT LOSS] ** : 0.821325 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003771 | Grad Max: 0.151879 -> Layer: shared_layers.0.bias | Grad Mean: 0.400645 | Grad Max: 1.956825 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.005781 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002073 | Grad Max: 0.002073 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002361 | Grad Max: 0.439429 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042937 
| Grad Max: 2.398978 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.009973 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020880 | Grad Max: 0.135823 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000344 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003888 | Grad Max: 0.008899 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000979 | Grad Max: 0.003428 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000535 | Grad Max: 0.002000 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015526 | Grad Max: 0.015526 [GRADIENT NORM TOTAL] 8.4697 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.959 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50102895 0.49897102] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 688/1360 | B: 613/1243 | C: 568/1480 [LOSS Ex1] A: 0.63507 | B: 0.61326 | C: 0.61060 [LOGITS Ex2 A] Mean Abs: 2.215 | Max: 7.109 [LOSS Ex2] A: 0.10196 | B: 0.30206 | C: 0.20442 ** [JOINT LOSS] ** : 0.822457 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002495 | Grad Max: 0.084145 -> Layer: shared_layers.0.bias | Grad Mean: 0.267379 | Grad Max: 1.201651 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.005610 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000123 | Grad Max: 0.000123 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001658 | Grad Max: 0.260653 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030078 | Grad Max: 1.447896 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000166 | Grad Max: 0.006663 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015346 | Grad Max: 0.084945 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000303 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002935 | Grad Max: 0.007499 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 
0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000673 | Grad Max: 0.002570 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000409 | Grad Max: 0.001732 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009611 | Grad Max: 0.009611 [GRADIENT NORM TOTAL] 5.5780 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.858 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5431274 0.4568726] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 659/1389 | C: 618/1430 [LOSS Ex1] A: 0.63564 | B: 0.61303 | C: 0.60530 [LOGITS Ex2 A] Mean Abs: 2.244 | Max: 6.291 [LOSS Ex2] A: 0.11414 | B: 0.31280 | C: 0.22304 ** [JOINT LOSS] ** : 0.834648 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005289 | Grad Max: 0.214770 -> Layer: shared_layers.0.bias | Grad Mean: 0.488552 | Grad Max: 2.295689 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005465 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007206 | Grad Max: 0.007206 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003130 | Grad Max: 0.654056 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057905 | Grad Max: 3.645379 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000320 | Grad Max: 0.011657 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029062 | Grad Max: 0.165547 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000452 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005711 | Grad Max: 0.012382 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000265 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001379 | Grad Max: 0.004056 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000666 | Grad Max: 0.002162 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019393 | Grad Max: 0.019393 [GRADIENT NORM TOTAL] 11.1332 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 
[LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.075 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8077895 0.19221045] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 751/1297 | B: 646/1402 | C: 579/1469 [LOSS Ex1] A: 0.62912 | B: 0.61347 | C: 0.60858 [LOGITS Ex2 A] Mean Abs: 2.281 | Max: 6.782 [LOSS Ex2] A: 0.10444 | B: 0.31587 | C: 0.23947 ** [JOINT LOSS] ** : 0.836984 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007527 | Grad Max: 0.251714 -> Layer: shared_layers.0.bias | Grad Mean: 0.641844 | Grad Max: 3.093090 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.006239 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001708 | Grad Max: 0.001708 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004001 | Grad Max: 0.733402 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074255 | Grad Max: 4.082465 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000431 | Grad Max: 0.015104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039133 | Grad Max: 0.220290 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000574 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007632 | Grad Max: 0.015123 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000368 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001882 | Grad Max: 0.005972 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000933 | Grad Max: 0.002419 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027361 | Grad Max: 0.027361 [GRADIENT NORM TOTAL] 14.4293 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.194 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002435 0.4997565] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 653/1395 | C: 557/1491 [LOSS Ex1] A: 0.63628 | B: 0.60897 | C: 0.61141 [LOGITS Ex2 A] Mean Abs: 2.278 | Max: 6.099 [LOSS Ex2] A: 0.09536 | B: 0.29520 | C: 0.21655 ** [JOINT 
LOSS] ** : 0.821258 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002754 | Grad Max: 0.058498 -> Layer: shared_layers.0.bias | Grad Mean: 0.176598 | Grad Max: 0.696940 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005120 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000873 | Grad Max: 0.000873 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001191 | Grad Max: 0.252388 -> Layer: exit2_layers.0.bias | Grad Mean: 0.021617 | Grad Max: 1.394597 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000100 | Grad Max: 0.004476 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008277 | Grad Max: 0.051339 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000198 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001449 | Grad Max: 0.004702 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000355 | Grad Max: 0.001261 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000390 | Grad Max: 0.001132 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004502 | Grad Max: 0.004502 [GRADIENT NORM TOTAL] 4.3870 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.842 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73661226 0.2633877 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 714/1334 | B: 613/1243 | C: 604/1444 [LOSS Ex1] A: 0.63146 | B: 0.61312 | C: 0.60376 [LOGITS Ex2 A] Mean Abs: 2.211 | Max: 6.612 [LOSS Ex2] A: 0.11359 | B: 0.31837 | C: 0.24034 ** [JOINT LOSS] ** : 0.840213 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009207 | Grad Max: 0.272457 -> Layer: shared_layers.0.bias | Grad Mean: 0.815414 | Grad Max: 3.618618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005526 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003296 | Grad Max: 0.003296 -> Layer: exit2_layers.0.weight | Grad Mean: 
0.005435 | Grad Max: 0.604844 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101298 | Grad Max: 3.349294 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000618 | Grad Max: 0.020649 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056483 | Grad Max: 0.305221 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000804 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010939 | Grad Max: 0.020917 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000507 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002638 | Grad Max: 0.008173 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001417 | Grad Max: 0.003328 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039494 | Grad Max: 0.039494 [GRADIENT NORM TOTAL] 17.4737 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.974 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63945895 0.36054105] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.084 [MASKS] A(Pass/Fail): 590/1026 | B: 659/1389 | C: 586/1462 [LOSS Ex1] A: 0.62960 | B: 0.61290 | C: 0.60443 [LOGITS Ex2 A] Mean Abs: 2.245 | Max: 8.402 [LOSS Ex2] A: 0.10210 | B: 0.34827 | C: 0.20398 ** [JOINT LOSS] ** : 0.833762 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011678 | Grad Max: 0.337708 -> Layer: shared_layers.0.bias | Grad Mean: 0.955025 | Grad Max: 4.566785 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.006261 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006318 | Grad Max: 0.006318 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006428 | Grad Max: 1.104421 -> Layer: exit2_layers.0.bias | Grad Mean: 0.119485 | Grad Max: 6.121582 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000692 | Grad Max: 0.022184 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063123 | Grad Max: 0.332739 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000838 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012291 | 
Grad Max: 0.024131 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000614 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002954 | Grad Max: 0.009129 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001546 | Grad Max: 0.003932 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043277 | Grad Max: 0.043277 [GRADIENT NORM TOTAL] 21.8891 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.196 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081215 0.4918785] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 646/1402 | C: 590/1458 [LOSS Ex1] A: 0.63033 | B: 0.61334 | C: 0.60682 [LOGITS Ex2 A] Mean Abs: 2.263 | Max: 7.230 [LOSS Ex2] A: 0.09322 | B: 0.31631 | C: 0.22672 ** [JOINT LOSS] ** : 0.828916 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004649 | Grad Max: 0.187035 -> Layer: shared_layers.0.bias | Grad Mean: 0.415648 | Grad Max: 2.470083 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005208 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003646 | Grad Max: 0.003646 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002643 | Grad Max: 0.802328 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049127 | Grad Max: 4.443995 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000255 | Grad Max: 0.009740 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023180 | Grad Max: 0.136063 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000365 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004383 | Grad Max: 0.009321 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001053 | Grad Max: 0.002979 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000598 | Grad Max: 0.002050 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016386 | Grad Max: 0.016386 [GRADIENT NORM TOTAL] 10.4621 >>> [TRAIN] BATCH 8 START <<< [DATA A] 
Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.127 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5066513 0.49334872] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 720/1328 | B: 654/1394 | C: 606/1442 [LOSS Ex1] A: 0.62700 | B: 0.60884 | C: 0.60829 [LOGITS Ex2 A] Mean Abs: 2.310 | Max: 6.360 [LOSS Ex2] A: 0.10348 | B: 0.30700 | C: 0.21884 ** [JOINT LOSS] ** : 0.824483 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007352 | Grad Max: 0.236437 -> Layer: shared_layers.0.bias | Grad Mean: 0.638477 | Grad Max: 2.993062 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002256 | Grad Max: 0.005853 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003998 | Grad Max: 0.003998 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004274 | Grad Max: 0.512546 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079345 | Grad Max: 2.855659 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000498 | Grad Max: 0.016363 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045366 | Grad Max: 0.231990 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000707 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008836 | Grad Max: 0.018571 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000411 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002138 | Grad Max: 0.006699 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001069 | Grad Max: 0.002918 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030740 | Grad Max: 0.030740 [GRADIENT NORM TOTAL] 13.7294 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.161 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.509246 0.49075398] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 721/1327 | B: 613/1243 | C: 586/1462 [LOSS Ex1] A: 0.62596 | B: 0.61299 | C: 0.60559 [LOGITS Ex2 A] Mean 
Abs: 2.317 | Max: 7.271 [LOSS Ex2] A: 0.12116 | B: 0.31034 | C: 0.21593 ** [JOINT LOSS] ** : 0.830661 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012962 | Grad Max: 0.312729 -> Layer: shared_layers.0.bias | Grad Mean: 0.815796 | Grad Max: 3.508162 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002231 | Grad Max: 0.006880 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001745 | Grad Max: 0.001745 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005482 | Grad Max: 0.678071 -> Layer: exit2_layers.0.bias | Grad Mean: 0.100227 | Grad Max: 3.719947 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000638 | Grad Max: 0.020359 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057517 | Grad Max: 0.293532 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000917 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011532 | Grad Max: 0.022435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000525 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002863 | Grad Max: 0.008365 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001492 | Grad Max: 0.003633 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042715 | Grad Max: 0.042715 [GRADIENT NORM TOTAL] 17.2773 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.963 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50102365 0.49897632] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 688/1360 | B: 659/1389 | C: 591/1457 [LOSS Ex1] A: 0.63489 | B: 0.61278 | C: 0.60411 [LOGITS Ex2 A] Mean Abs: 2.242 | Max: 5.597 [LOSS Ex2] A: 0.09617 | B: 0.31125 | C: 0.21797 ** [JOINT LOSS] ** : 0.825727 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004017 | Grad Max: 0.090805 -> Layer: shared_layers.0.bias | Grad Mean: 0.220542 | Grad Max: 1.057100 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.006229 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.011158 | Grad Max: 0.011158 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001578 | Grad Max: 0.427672 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028762 | Grad Max: 2.365088 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000169 | Grad Max: 0.006805 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014972 | Grad Max: 0.082522 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000347 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002970 | Grad Max: 0.007981 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000163 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000680 | Grad Max: 0.002098 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000393 | Grad Max: 0.001534 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008739 | Grad Max: 0.008739 [GRADIENT NORM TOTAL] 5.4385 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.861 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54309046 0.4569095 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 646/1402 | C: 575/1473 [LOSS Ex1] A: 0.63547 | B: 0.61322 | C: 0.61056 [LOGITS Ex2 A] Mean Abs: 2.160 | Max: 6.502 [LOSS Ex2] A: 0.10693 | B: 0.33010 | C: 0.22476 ** [JOINT LOSS] ** : 0.840349 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011054 | Grad Max: 0.293140 -> Layer: shared_layers.0.bias | Grad Mean: 0.897630 | Grad Max: 3.743378 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005659 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010013 | Grad Max: 0.010013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005526 | Grad Max: 0.693237 -> Layer: exit2_layers.0.bias | Grad Mean: 0.103252 | Grad Max: 3.722496 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000630 | Grad Max: 0.020459 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057446 | Grad Max: 0.294239 -> Layer: exit2_layers.6.weight | Grad Mean: 
0.000083 | Grad Max: 0.000833 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011192 | Grad Max: 0.021596 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000533 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002732 | Grad Max: 0.008231 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001473 | Grad Max: 0.003763 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040755 | Grad Max: 0.040755 [GRADIENT NORM TOTAL] 18.4589 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.078 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8086036 0.19139645] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 654/1394 | C: 597/1451 [LOSS Ex1] A: 0.62895 | B: 0.60873 | C: 0.60985 [LOGITS Ex2 A] Mean Abs: 2.185 | Max: 7.421 [LOSS Ex2] A: 0.10983 | B: 0.32192 | C: 0.20407 ** [JOINT LOSS] ** : 0.827784 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011175 | Grad Max: 0.316782 -> Layer: shared_layers.0.bias | Grad Mean: 0.996867 | Grad Max: 4.236221 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.006094 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004419 | Grad Max: 0.004419 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006133 | Grad Max: 0.734921 -> Layer: exit2_layers.0.bias | Grad Mean: 0.115342 | Grad Max: 4.077534 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000720 | Grad Max: 0.024475 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065948 | Grad Max: 0.352022 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.000939 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012628 | Grad Max: 0.025520 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000586 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003011 | Grad Max: 0.009182 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001521 | Grad Max: 0.003923 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043046 | 
Grad Max: 0.043046 [GRADIENT NORM TOTAL] 20.6800 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.198 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002747 0.4997253] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 724/1324 | B: 613/1243 | C: 392/984 [LOSS Ex1] A: 0.63612 | B: 0.61288 | C: 0.61116 [LOGITS Ex2 A] Mean Abs: 2.224 | Max: 6.732 [LOSS Ex2] A: 0.09169 | B: 0.31189 | C: 0.23384 ** [JOINT LOSS] ** : 0.832527 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009426 | Grad Max: 0.253671 -> Layer: shared_layers.0.bias | Grad Mean: 0.612560 | Grad Max: 2.526103 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005769 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002872 | Grad Max: 0.002872 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003839 | Grad Max: 0.539015 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070943 | Grad Max: 2.923738 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000455 | Grad Max: 0.015638 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041105 | Grad Max: 0.216833 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000651 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008114 | Grad Max: 0.016197 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001973 | Grad Max: 0.005768 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001004 | Grad Max: 0.002587 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028925 | Grad Max: 0.028925 [GRADIENT NORM TOTAL] 12.4163 [EPOCH SUMMARY] Train Loss: 0.8301 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8105 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8114 -> New: 0.8105) ############################## EPOCH 159/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.844 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7371529 0.2628472] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 714/1334 | B: 659/1389 | C: 603/1445 [LOSS Ex1] A: 0.63130 | B: 0.61268 | C: 0.60653 [LOGITS Ex2 A] Mean Abs: 2.268 | Max: 6.115 [LOSS Ex2] A: 0.11455 | B: 0.31769 | C: 0.23269 ** [JOINT LOSS] ** : 0.838482 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003503 | Grad Max: 0.230789 -> Layer: shared_layers.0.bias | Grad Mean: 0.506110 | Grad Max: 2.973874 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002115 | Grad Max: 0.005449 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007077 | Grad Max: 0.007077 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003203 | Grad Max: 0.761161 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059187 | Grad Max: 4.240593 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000306 | Grad Max: 0.011886 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028544 | Grad Max: 0.157957 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000417 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005456 | Grad Max: 0.011637 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000274 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001308 | Grad Max: 0.004116 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000589 | Grad Max: 0.001883 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018080 | Grad Max: 0.018080 [GRADIENT NORM TOTAL] 12.2036 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.978 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6396761 0.3603239] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 646/1402 | C: 582/1466 [LOSS Ex1] A: 0.62944 | B: 0.61312 | C: 0.60527 [LOGITS Ex2 A] Mean Abs: 2.332 | Max: 8.350 [LOSS Ex2] A: 0.10723 | B: 0.33608 | C: 0.19948 ** [JOINT LOSS] ** : 0.830208 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006919 | Grad Max: 0.396834 -> Layer: shared_layers.0.bias | Grad Mean: 0.927320 | Grad Max: 5.301015 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008960 | Grad Max: 0.008960 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005959 | Grad Max: 1.157272 -> Layer: exit2_layers.0.bias | Grad Mean: 0.111614 | Grad Max: 6.443532 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000609 | Grad Max: 0.021855 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056698 | Grad Max: 0.303807 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000838 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010893 | Grad Max: 0.021766 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000496 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002638 | Grad Max: 0.007871 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001229 | Grad Max: 0.003008 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036963 | Grad Max: 0.036963 [GRADIENT NORM TOTAL] 21.8708 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.199 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081272 0.4918728] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 654/1394 | C: 585/1463 [LOSS Ex1] A: 0.63018 | B: 0.60864 | C: 0.60773 [LOGITS Ex2 A] Mean Abs: 2.299 | Max: 9.558 [LOSS Ex2] A: 0.09581 | B: 0.30743 | C: 0.24821 ** [JOINT LOSS] ** : 0.832664 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007641 | Grad Max: 0.338162 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.834787 | Grad Max: 4.455676 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005305 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003098 | Grad Max: 0.003098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005438 | Grad Max: 0.899949 -> Layer: exit2_layers.0.bias | Grad Mean: 0.100932 | Grad Max: 5.022822 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.020620 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052738 | Grad Max: 0.293145 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000712 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010319 | Grad Max: 0.020872 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000469 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002508 | Grad Max: 0.007524 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001233 | Grad Max: 0.003069 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035961 | Grad Max: 0.035961 [GRADIENT NORM TOTAL] 18.9772 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.131 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50656617 0.4934338 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 720/1328 | B: 613/1243 | C: 602/1446 [LOSS Ex1] A: 0.62686 | B: 0.61279 | C: 0.60848 [LOGITS Ex2 A] Mean Abs: 2.256 | Max: 6.641 [LOSS Ex2] A: 0.10221 | B: 0.29149 | C: 0.22059 ** [JOINT LOSS] ** : 0.820810 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.061074 -> Layer: shared_layers.0.bias | Grad Mean: 0.132298 | Grad Max: 0.762006 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006564 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007361 | Grad Max: 0.007361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000907 | Grad Max: 0.323117 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016169 | Grad Max: 1.791312 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000049 | Grad Max: 0.003371 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003509 | Grad Max: 0.037034 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000150 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000720 | Grad Max: 0.003465 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000077 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.000894 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000381 | Grad Max: 0.001065 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003108 | Grad Max: 0.003108 [GRADIENT NORM TOTAL] 4.0573 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.165 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50930864 0.49069136] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 721/1327 | B: 659/1389 | C: 627/1421 [LOSS Ex1] A: 0.62582 | B: 0.61259 | C: 0.60331 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 7.189 [LOSS Ex2] A: 0.10594 | B: 0.34546 | C: 0.22493 ** [JOINT LOSS] ** : 0.839355 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006781 | Grad Max: 0.300614 -> Layer: shared_layers.0.bias | Grad Mean: 0.799004 | Grad Max: 3.994755 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002222 | Grad Max: 0.006410 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001450 | Grad Max: 0.001450 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005118 | Grad Max: 0.862552 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095954 | Grad Max: 4.778863 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000554 | Grad Max: 0.018900 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051502 | Grad Max: 0.275887 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000742 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009892 | Grad Max: 0.020329 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000424 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002432 | Grad 
Max: 0.006935 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.002935 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036361 | Grad Max: 0.036361 [GRADIENT NORM TOTAL] 17.8803 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.965 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009972 0.49900278] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 687/1361 | B: 646/1402 | C: 595/1453 [LOSS Ex1] A: 0.63476 | B: 0.61303 | C: 0.61235 [LOGITS Ex2 A] Mean Abs: 2.156 | Max: 5.709 [LOSS Ex2] A: 0.10609 | B: 0.34079 | C: 0.23944 ** [JOINT LOSS] ** : 0.848825 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008473 | Grad Max: 0.309253 -> Layer: shared_layers.0.bias | Grad Mean: 0.894625 | Grad Max: 4.017837 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002058 | Grad Max: 0.005430 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000062 | Grad Max: 0.000062 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005674 | Grad Max: 1.003153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.106524 | Grad Max: 5.565329 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000637 | Grad Max: 0.022264 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058999 | Grad Max: 0.320150 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000796 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011386 | Grad Max: 0.022087 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000541 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002810 | Grad Max: 0.008773 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001522 | Grad Max: 0.003485 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042909 | Grad Max: 0.042909 [GRADIENT NORM TOTAL] 19.5582 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.863 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5430183 0.45698175] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 654/1394 | C: 575/1473 [LOSS Ex1] A: 0.63535 | B: 0.60855 | C: 0.61078 [LOGITS Ex2 A] Mean Abs: 2.165 | Max: 6.020 [LOSS Ex2] A: 0.10657 | B: 0.30608 | C: 0.21916 ** [JOINT LOSS] ** : 0.828827 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003550 | Grad Max: 0.179281 -> Layer: shared_layers.0.bias | Grad Mean: 0.423705 | Grad Max: 2.449785 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005676 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004495 | Grad Max: 0.004495 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002567 | Grad Max: 0.751546 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047745 | Grad Max: 4.159624 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000266 | Grad Max: 0.008659 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024598 | Grad Max: 0.143189 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004735 | Grad Max: 0.010315 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000255 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001157 | Grad Max: 0.003710 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.002053 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017030 | Grad Max: 0.017030 [GRADIENT NORM TOTAL] 10.4199 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.081 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8091866 0.19081348] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 614/1242 | C: 595/1453 [LOSS Ex1] A: 0.62883 | B: 0.61269 | C: 0.60959 [LOGITS Ex2 A] Mean Abs: 2.265 | Max: 6.028 [LOSS Ex2] A: 0.10398 | B: 0.29693 | C: 0.23802 ** [JOINT LOSS] ** : 0.830014 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.007396 | Grad Max: 0.174464 -> Layer: shared_layers.0.bias | Grad Mean: 0.526854 | Grad Max: 2.245493 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005535 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003248 | Grad Max: 0.003248 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003779 | Grad Max: 0.541816 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069401 | Grad Max: 3.028430 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000424 | Grad Max: 0.013114 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038754 | Grad Max: 0.197482 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000582 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007735 | Grad Max: 0.015133 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000380 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001896 | Grad Max: 0.005591 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000928 | Grad Max: 0.002465 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026609 | Grad Max: 0.026609 [GRADIENT NORM TOTAL] 11.6302 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.201 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500279 0.49972105] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 723/1325 | B: 659/1389 | C: 597/1451 [LOSS Ex1] A: 0.63601 | B: 0.61250 | C: 0.60227 [LOGITS Ex2 A] Mean Abs: 2.286 | Max: 6.708 [LOSS Ex2] A: 0.09565 | B: 0.31617 | C: 0.22020 ** [JOINT LOSS] ** : 0.827595 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007869 | Grad Max: 0.225675 -> Layer: shared_layers.0.bias | Grad Mean: 0.705917 | Grad Max: 3.113297 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.005317 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004493 | Grad Max: 0.004493 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004707 | Grad Max: 0.583837 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.087936 | Grad Max: 3.267220 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000528 | Grad Max: 0.016457 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048694 | Grad Max: 0.258553 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009523 | Grad Max: 0.019559 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000458 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002326 | Grad Max: 0.007121 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001177 | Grad Max: 0.002958 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033818 | Grad Max: 0.033818 [GRADIENT NORM TOTAL] 15.5886 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.847 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73758537 0.2624146 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 714/1334 | B: 646/1402 | C: 580/1468 [LOSS Ex1] A: 0.63119 | B: 0.61294 | C: 0.60991 [LOGITS Ex2 A] Mean Abs: 2.231 | Max: 7.140 [LOSS Ex2] A: 0.11624 | B: 0.30848 | C: 0.21346 ** [JOINT LOSS] ** : 0.830739 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002700 | Grad Max: 0.069846 -> Layer: shared_layers.0.bias | Grad Mean: 0.243449 | Grad Max: 1.092979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005619 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004798 | Grad Max: 0.004798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001849 | Grad Max: 0.380293 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033533 | Grad Max: 2.115530 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000149 | Grad Max: 0.006632 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013951 | Grad Max: 0.067854 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000220 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002630 | Grad Max: 0.006323 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000136 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000620 | Grad Max: 0.001903 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000396 | Grad Max: 0.001450 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008347 | Grad Max: 0.008347 [GRADIENT NORM TOTAL] 6.4492 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.980 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.63990515 0.36009485] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 654/1394 | C: 613/1435 [LOSS Ex1] A: 0.62933 | B: 0.60845 | C: 0.60240 [LOGITS Ex2 A] Mean Abs: 2.223 | Max: 9.316 [LOSS Ex2] A: 0.11598 | B: 0.30103 | C: 0.19701 ** [JOINT LOSS] ** : 0.818068 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008342 | Grad Max: 0.209438 -> Layer: shared_layers.0.bias | Grad Mean: 0.628049 | Grad Max: 2.808234 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006230 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009408 | Grad Max: 0.009408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004002 | Grad Max: 0.473613 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074373 | Grad Max: 2.637120 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.015395 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042116 | Grad Max: 0.220122 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000643 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008297 | Grad Max: 0.016603 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000406 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002027 | Grad Max: 0.006210 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001046 | Grad Max: 0.003079 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029628 | Grad Max: 0.029628 [GRADIENT NORM TOTAL] 13.1303 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.201 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081071 0.4918929] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 564/1484 [LOSS Ex1] A: 0.63007 | B: 0.61260 | C: 0.60957 [LOGITS Ex2 A] Mean Abs: 2.207 | Max: 8.820 [LOSS Ex2] A: 0.09776 | B: 0.32490 | C: 0.20816 ** [JOINT LOSS] ** : 0.827684 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009911 | Grad Max: 0.250778 -> Layer: shared_layers.0.bias | Grad Mean: 0.759239 | Grad Max: 3.540335 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.005569 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002111 | Grad Max: 0.002112 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005097 | Grad Max: 0.582361 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094263 | Grad Max: 3.214533 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.017370 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053080 | Grad Max: 0.255801 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000809 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010545 | Grad Max: 0.021792 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000499 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002569 | Grad Max: 0.008079 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001323 | Grad Max: 0.003563 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037080 | Grad Max: 0.037080 [GRADIENT NORM TOTAL] 16.0767 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.133 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50654465 0.4934554 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 720/1328 | B: 659/1389 | C: 584/1464 [LOSS Ex1] A: 0.62675 | B: 0.61241 | C: 0.60694 [LOGITS Ex2 A] Mean Abs: 2.254 | Max: 
6.384 [LOSS Ex2] A: 0.09918 | B: 0.32093 | C: 0.22022 ** [JOINT LOSS] ** : 0.828809 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003316 | Grad Max: 0.086155 -> Layer: shared_layers.0.bias | Grad Mean: 0.235363 | Grad Max: 1.112454 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.006439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001098 | Grad Max: 0.001098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001851 | Grad Max: 0.231496 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033415 | Grad Max: 1.252288 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000196 | Grad Max: 0.006739 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018283 | Grad Max: 0.108397 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000293 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003663 | Grad Max: 0.008174 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000927 | Grad Max: 0.002771 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000493 | Grad Max: 0.001837 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014105 | Grad Max: 0.014105 [GRADIENT NORM TOTAL] 5.6994 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.167 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5093321 0.49066794] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.084 [MASKS] A(Pass/Fail): 721/1327 | B: 646/1402 | C: 374/1002 [LOSS Ex1] A: 0.62571 | B: 0.61285 | C: 0.60670 [LOGITS Ex2 A] Mean Abs: 2.273 | Max: 6.249 [LOSS Ex2] A: 0.13349 | B: 0.33118 | C: 0.21824 ** [JOINT LOSS] ** : 0.842724 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011451 | Grad Max: 0.393806 -> Layer: shared_layers.0.bias | Grad Mean: 1.050436 | Grad Max: 5.167544 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005920 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004422 | 
Grad Max: 0.004422 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006586 | Grad Max: 1.293918 -> Layer: exit2_layers.0.bias | Grad Mean: 0.122334 | Grad Max: 7.189813 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000708 | Grad Max: 0.023341 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065541 | Grad Max: 0.327661 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.001016 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012916 | Grad Max: 0.026528 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000616 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003176 | Grad Max: 0.009411 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001589 | Grad Max: 0.003524 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045557 | Grad Max: 0.045557 [GRADIENT NORM TOTAL] 23.3377 [EPOCH SUMMARY] Train Loss: 0.8318 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8409 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 160/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.967 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500957 0.49904302] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 687/1361 | B: 654/1394 | C: 589/1459 [LOSS Ex1] A: 0.63465 | B: 0.60837 | C: 0.60905 [LOGITS Ex2 A] Mean Abs: 2.259 | Max: 5.619 [LOSS Ex2] A: 0.11481 | B: 0.34056 | C: 0.26131 ** [JOINT LOSS] ** : 0.856248 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014766 | Grad Max: 0.528722 -> Layer: shared_layers.0.bias | Grad Mean: 1.455103 | Grad Max: 7.137743 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005487 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004013 | Grad Max: 0.004013 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009161 | Grad Max: 1.609906 -> Layer: exit2_layers.0.bias | 
Grad Mean: 0.170950 | Grad Max: 8.957437 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001000 | Grad Max: 0.033298 -> Layer: exit2_layers.3.bias | Grad Mean: 0.092770 | Grad Max: 0.492437 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000131 | Grad Max: 0.001247 -> Layer: exit2_layers.6.bias | Grad Mean: 0.018102 | Grad Max: 0.035314 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000839 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004431 | Grad Max: 0.013405 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002156 | Grad Max: 0.004902 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063153 | Grad Max: 0.063153 [GRADIENT NORM TOTAL] 32.2260 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.135 | Max: 0.865 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429642 0.45703575] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.560 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 614/1242 | C: 590/1458 [LOSS Ex1] A: 0.63524 | B: 0.61251 | C: 0.61112 [LOGITS Ex2 A] Mean Abs: 2.246 | Max: 6.637 [LOSS Ex2] A: 0.11994 | B: 0.32622 | C: 0.23867 ** [JOINT LOSS] ** : 0.847899 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013001 | Grad Max: 0.439070 -> Layer: shared_layers.0.bias | Grad Mean: 1.211040 | Grad Max: 5.760216 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005286 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003647 | Grad Max: 0.003647 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007689 | Grad Max: 1.314479 -> Layer: exit2_layers.0.bias | Grad Mean: 0.143750 | Grad Max: 7.307137 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000846 | Grad Max: 0.028756 -> Layer: exit2_layers.3.bias | Grad Mean: 0.078325 | Grad Max: 0.411641 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001035 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015296 | Grad Max: 0.029526 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000044 | Grad Max: 0.000692 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003781 | Grad Max: 0.010817 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001931 | Grad Max: 0.004127 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055216 | Grad Max: 0.055216 [GRADIENT NORM TOTAL] 26.5644 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.084 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.80968964 0.19031039] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 660/1388 | C: 583/1465 [LOSS Ex1] A: 0.62872 | B: 0.61234 | C: 0.60595 [LOGITS Ex2 A] Mean Abs: 2.239 | Max: 7.752 [LOSS Ex2] A: 0.10471 | B: 0.32549 | C: 0.21150 ** [JOINT LOSS] ** : 0.829570 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006461 | Grad Max: 0.181002 -> Layer: shared_layers.0.bias | Grad Mean: 0.396987 | Grad Max: 2.303215 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005805 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001934 | Grad Max: 0.001934 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002903 | Grad Max: 0.533817 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053356 | Grad Max: 2.965971 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.009574 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029477 | Grad Max: 0.136624 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000462 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005888 | Grad Max: 0.011923 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001469 | Grad Max: 0.004351 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000741 | Grad Max: 0.002188 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021193 | Grad Max: 0.021193 [GRADIENT NORM TOTAL] 9.2871 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | 
Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.204 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5002959 0.49970412] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.083 [MASKS] A(Pass/Fail): 723/1325 | B: 646/1402 | C: 583/1465 [LOSS Ex1] A: 0.63590 | B: 0.61278 | C: 0.60948 [LOGITS Ex2 A] Mean Abs: 2.172 | Max: 6.958 [LOSS Ex2] A: 0.09342 | B: 0.34892 | C: 0.24674 ** [JOINT LOSS] ** : 0.849078 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009332 | Grad Max: 0.309701 -> Layer: shared_layers.0.bias | Grad Mean: 0.882837 | Grad Max: 4.097857 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.005179 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004135 | Grad Max: 0.004135 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005496 | Grad Max: 0.739414 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102408 | Grad Max: 4.097509 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000626 | Grad Max: 0.020551 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058563 | Grad Max: 0.301898 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000824 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011509 | Grad Max: 0.022830 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000545 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002852 | Grad Max: 0.008426 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001487 | Grad Max: 0.003620 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043026 | Grad Max: 0.043026 [GRADIENT NORM TOTAL] 18.7568 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.849 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.737969 0.262031] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 714/1334 | B: 654/1394 | C: 605/1443 [LOSS Ex1] A: 0.63107 | B: 0.60829 | C: 0.60907 [LOGITS Ex2 A] Mean Abs: 2.107 | Max: 6.597 [LOSS Ex2] A: 0.13215 | B: 0.36784 | 
C: 0.22182 ** [JOINT LOSS] ** : 0.856742 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015805 | Grad Max: 0.440227 -> Layer: shared_layers.0.bias | Grad Mean: 1.316223 | Grad Max: 5.757745 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002111 | Grad Max: 0.005861 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003058 | Grad Max: 0.003058 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008396 | Grad Max: 1.026041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.156064 | Grad Max: 5.796410 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000976 | Grad Max: 0.031182 -> Layer: exit2_layers.3.bias | Grad Mean: 0.090391 | Grad Max: 0.469743 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001280 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017801 | Grad Max: 0.035178 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000825 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004410 | Grad Max: 0.013092 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002223 | Grad Max: 0.004992 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064289 | Grad Max: 0.064289 [GRADIENT NORM TOTAL] 27.6736 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.983 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6400871 0.3599129] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 614/1242 | C: 592/1456 [LOSS Ex1] A: 0.62921 | B: 0.61244 | C: 0.60802 [LOGITS Ex2 A] Mean Abs: 2.168 | Max: 9.929 [LOSS Ex2] A: 0.11207 | B: 0.35828 | C: 0.21036 ** [JOINT LOSS] ** : 0.843459 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012903 | Grad Max: 0.388497 -> Layer: shared_layers.0.bias | Grad Mean: 1.140571 | Grad Max: 5.111253 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.006220 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000286 | Grad Max: 0.000286 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.007509 | Grad Max: 0.987360 -> Layer: exit2_layers.0.bias | Grad Mean: 0.140045 | Grad Max: 5.508556 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000855 | Grad Max: 0.027912 -> Layer: exit2_layers.3.bias | Grad Mean: 0.079450 | Grad Max: 0.416290 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001118 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015560 | Grad Max: 0.031546 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000703 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003863 | Grad Max: 0.011342 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001891 | Grad Max: 0.004119 -> Layer: exit2_layers.12.bias | Grad Mean: 0.055518 | Grad Max: 0.055518 [GRADIENT NORM TOTAL] 24.9343 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.205 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50813186 0.49186814] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 660/1388 | C: 580/1468 [LOSS Ex1] A: 0.62995 | B: 0.61227 | C: 0.60662 [LOGITS Ex2 A] Mean Abs: 2.185 | Max: 9.265 [LOSS Ex2] A: 0.09490 | B: 0.32593 | C: 0.21840 ** [JOINT LOSS] ** : 0.829357 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007070 | Grad Max: 0.235820 -> Layer: shared_layers.0.bias | Grad Mean: 0.624730 | Grad Max: 3.197259 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.005454 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000759 | Grad Max: 0.000759 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003969 | Grad Max: 0.727640 -> Layer: exit2_layers.0.bias | Grad Mean: 0.072888 | Grad Max: 4.093098 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000426 | Grad Max: 0.013317 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039477 | Grad Max: 0.194243 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000583 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.007901 | Grad Max: 0.015931 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000438 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001961 | Grad Max: 0.006480 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000970 | Grad Max: 0.002995 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028222 | Grad Max: 0.028222 [GRADIENT NORM TOTAL] 13.9348 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.136 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5064795 0.49352053] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 720/1328 | B: 646/1402 | C: 592/1456 [LOSS Ex1] A: 0.62663 | B: 0.61271 | C: 0.60729 [LOGITS Ex2 A] Mean Abs: 2.234 | Max: 5.805 [LOSS Ex2] A: 0.11035 | B: 0.31355 | C: 0.23735 ** [JOINT LOSS] ** : 0.835960 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008281 | Grad Max: 0.271023 -> Layer: shared_layers.0.bias | Grad Mean: 0.799141 | Grad Max: 3.543289 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005811 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002548 | Grad Max: 0.002548 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005197 | Grad Max: 0.795315 -> Layer: exit2_layers.0.bias | Grad Mean: 0.097112 | Grad Max: 4.401843 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.018592 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054990 | Grad Max: 0.276612 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000817 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010742 | Grad Max: 0.022312 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000467 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002671 | Grad Max: 0.007599 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001274 | Grad Max: 0.003312 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037991 | Grad Max: 0.037991 [GRADIENT NORM TOTAL] 
17.3426 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.170 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50940764 0.4905924 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 721/1327 | B: 654/1394 | C: 578/1470 [LOSS Ex1] A: 0.62560 | B: 0.60822 | C: 0.60714 [LOGITS Ex2 A] Mean Abs: 2.255 | Max: 8.633 [LOSS Ex2] A: 0.13043 | B: 0.34194 | C: 0.24391 ** [JOINT LOSS] ** : 0.852412 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014919 | Grad Max: 0.439405 -> Layer: shared_layers.0.bias | Grad Mean: 1.304749 | Grad Max: 5.844718 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.006568 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001194 | Grad Max: 0.001194 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008635 | Grad Max: 1.220349 -> Layer: exit2_layers.0.bias | Grad Mean: 0.160856 | Grad Max: 6.852180 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000968 | Grad Max: 0.031964 -> Layer: exit2_layers.3.bias | Grad Mean: 0.089790 | Grad Max: 0.483073 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000128 | Grad Max: 0.001206 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017775 | Grad Max: 0.034101 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000811 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004430 | Grad Max: 0.013120 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002164 | Grad Max: 0.004465 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063131 | Grad Max: 0.063131 [GRADIENT NORM TOTAL] 28.5682 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.970 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50096303 0.499037 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 687/1361 | B: 614/1242 | C: 588/1460 [LOSS Ex1] A: 
0.63454 | B: 0.61237 | C: 0.60969 [LOGITS Ex2 A] Mean Abs: 2.231 | Max: 5.392 [LOSS Ex2] A: 0.10552 | B: 0.31439 | C: 0.23968 ** [JOINT LOSS] ** : 0.838733 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009013 | Grad Max: 0.374025 -> Layer: shared_layers.0.bias | Grad Mean: 0.997467 | Grad Max: 4.787257 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005419 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005730 | Grad Max: 0.005730 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006346 | Grad Max: 0.856108 -> Layer: exit2_layers.0.bias | Grad Mean: 0.118237 | Grad Max: 4.718607 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000685 | Grad Max: 0.023632 -> Layer: exit2_layers.3.bias | Grad Mean: 0.064520 | Grad Max: 0.343666 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000839 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012734 | Grad Max: 0.024544 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000606 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003253 | Grad Max: 0.009651 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001553 | Grad Max: 0.003758 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047211 | Grad Max: 0.047211 [GRADIENT NORM TOTAL] 21.5971 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.867 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429509 0.45704907] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 660/1388 | C: 646/1402 [LOSS Ex1] A: 0.63515 | B: 0.61220 | C: 0.59839 [LOGITS Ex2 A] Mean Abs: 2.175 | Max: 5.952 [LOSS Ex2] A: 0.11468 | B: 0.31766 | C: 0.21599 ** [JOINT LOSS] ** : 0.831356 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002764 | Grad Max: 0.127330 -> Layer: shared_layers.0.bias | Grad Mean: 0.279675 | Grad Max: 1.651603 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | 
Grad Max: 0.006020 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010517 | Grad Max: 0.010517 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001784 | Grad Max: 0.248521 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032997 | Grad Max: 1.385486 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000202 | Grad Max: 0.007199 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018817 | Grad Max: 0.115625 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000297 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003639 | Grad Max: 0.008316 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000900 | Grad Max: 0.002734 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001558 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012725 | Grad Max: 0.012725 [GRADIENT NORM TOTAL] 5.9725 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81014526 0.18985473] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 646/1402 | C: 573/1475 [LOSS Ex1] A: 0.62862 | B: 0.61265 | C: 0.60716 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 7.430 [LOSS Ex2] A: 0.10380 | B: 0.35090 | C: 0.23709 ** [JOINT LOSS] ** : 0.846737 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008511 | Grad Max: 0.334128 -> Layer: shared_layers.0.bias | Grad Mean: 0.929512 | Grad Max: 4.536291 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000658 | Grad Max: 0.000658 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005888 | Grad Max: 0.986504 -> Layer: exit2_layers.0.bias | Grad Mean: 0.110723 | Grad Max: 5.463632 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000632 | Grad Max: 0.020752 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058991 | Grad Max: 
0.300397 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000799 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011462 | Grad Max: 0.022680 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000547 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002869 | Grad Max: 0.008912 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001436 | Grad Max: 0.003300 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042100 | Grad Max: 0.042100 [GRADIENT NORM TOTAL] 20.6991 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.206 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003158 0.49968415] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 655/1393 | C: 615/1433 [LOSS Ex1] A: 0.63580 | B: 0.60816 | C: 0.60573 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 5.678 [LOSS Ex2] A: 0.10618 | B: 0.37697 | C: 0.24629 ** [JOINT LOSS] ** : 0.859710 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013417 | Grad Max: 0.471862 -> Layer: shared_layers.0.bias | Grad Mean: 1.367699 | Grad Max: 6.252672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005121 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000523 | Grad Max: 0.000523 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008540 | Grad Max: 1.280424 -> Layer: exit2_layers.0.bias | Grad Mean: 0.159934 | Grad Max: 7.092551 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000961 | Grad Max: 0.032648 -> Layer: exit2_layers.3.bias | Grad Mean: 0.089994 | Grad Max: 0.468748 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000126 | Grad Max: 0.001174 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017661 | Grad Max: 0.033426 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000830 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004421 | Grad Max: 0.012819 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002184 | Grad Max: 0.004940 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.064705 | Grad Max: 0.064705 [GRADIENT NORM TOTAL] 29.3540 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.850 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7383021 0.26169786] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 714/1334 | B: 614/1242 | C: 418/958 [LOSS Ex1] A: 0.63097 | B: 0.61231 | C: 0.60197 [LOGITS Ex2 A] Mean Abs: 2.062 | Max: 6.724 [LOSS Ex2] A: 0.12518 | B: 0.37553 | C: 0.20809 ** [JOINT LOSS] ** : 0.851350 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014967 | Grad Max: 0.432222 -> Layer: shared_layers.0.bias | Grad Mean: 1.340253 | Grad Max: 5.807350 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005519 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004659 | Grad Max: 0.004659 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008464 | Grad Max: 1.121580 -> Layer: exit2_layers.0.bias | Grad Mean: 0.158909 | Grad Max: 6.241195 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000975 | Grad Max: 0.031505 -> Layer: exit2_layers.3.bias | Grad Mean: 0.090971 | Grad Max: 0.468691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001203 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017844 | Grad Max: 0.034222 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000815 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004473 | Grad Max: 0.013015 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002194 | Grad Max: 0.004686 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064594 | Grad Max: 0.064594 [GRADIENT NORM TOTAL] 28.3901 [EPOCH SUMMARY] Train Loss: 0.8449 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8220 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 161/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.984 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64020723 0.35979277] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 660/1388 | C: 581/1467 [LOSS Ex1] A: 0.62911 | B: 0.61215 | C: 0.60877 [LOGITS Ex2 A] Mean Abs: 2.146 | Max: 8.113 [LOSS Ex2] A: 0.11459 | B: 0.34426 | C: 0.24992 ** [JOINT LOSS] ** : 0.852935 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008828 | Grad Max: 0.303019 -> Layer: shared_layers.0.bias | Grad Mean: 0.855269 | Grad Max: 4.078129 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005974 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006967 | Grad Max: 0.006968 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005205 | Grad Max: 0.942813 -> Layer: exit2_layers.0.bias | Grad Mean: 0.097326 | Grad Max: 5.238788 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000573 | Grad Max: 0.019187 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053501 | Grad Max: 0.269647 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000809 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010491 | Grad Max: 0.022177 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000433 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002669 | Grad Max: 0.007078 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001371 | Grad Max: 0.003093 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040277 | Grad Max: 0.040277 [GRADIENT NORM TOTAL] 18.6789 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.207 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50815976 0.49184027] | 
Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 647/1401 | C: 614/1434 [LOSS Ex1] A: 0.62986 | B: 0.61259 | C: 0.60282 [LOGITS Ex2 A] Mean Abs: 2.208 | Max: 7.621 [LOSS Ex2] A: 0.09722 | B: 0.31477 | C: 0.19234 ** [JOINT LOSS] ** : 0.816530 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006425 | Grad Max: 0.176928 -> Layer: shared_layers.0.bias | Grad Mean: 0.468144 | Grad Max: 2.309814 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005651 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000019 | Grad Max: 0.000019 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003155 | Grad Max: 0.491187 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058031 | Grad Max: 2.751588 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000324 | Grad Max: 0.009412 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029770 | Grad Max: 0.139437 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000480 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006119 | Grad Max: 0.012080 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000319 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001559 | Grad Max: 0.004746 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000752 | Grad Max: 0.002619 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022316 | Grad Max: 0.022316 [GRADIENT NORM TOTAL] 10.6617 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.138 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50639325 0.49360672] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 720/1328 | B: 656/1392 | C: 586/1462 [LOSS Ex1] A: 0.62654 | B: 0.60810 | C: 0.60782 [LOGITS Ex2 A] Mean Abs: 2.217 | Max: 5.414 [LOSS Ex2] A: 0.11318 | B: 0.31705 | C: 0.21124 ** [JOINT LOSS] ** : 0.827977 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010309 | Grad 
Max: 0.356536 -> Layer: shared_layers.0.bias | Grad Mean: 0.941553 | Grad Max: 4.747566 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002281 | Grad Max: 0.006598 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005705 | Grad Max: 0.005705 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006171 | Grad Max: 0.940999 -> Layer: exit2_layers.0.bias | Grad Mean: 0.114857 | Grad Max: 5.266811 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000653 | Grad Max: 0.019791 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061291 | Grad Max: 0.299631 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000816 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012200 | Grad Max: 0.023819 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000546 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003102 | Grad Max: 0.008790 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001522 | Grad Max: 0.003815 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045557 | Grad Max: 0.045557 [GRADIENT NORM TOTAL] 21.1141 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.172 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50947475 0.49052528] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 721/1327 | B: 614/1242 | C: 592/1456 [LOSS Ex1] A: 0.62550 | B: 0.61225 | C: 0.60975 [LOGITS Ex2 A] Mean Abs: 2.178 | Max: 5.411 [LOSS Ex2] A: 0.12588 | B: 0.30728 | C: 0.22997 ** [JOINT LOSS] ** : 0.836878 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010469 | Grad Max: 0.291542 -> Layer: shared_layers.0.bias | Grad Mean: 0.802743 | Grad Max: 3.777721 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006013 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005749 | Grad Max: 0.005749 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005240 | Grad Max: 0.827562 -> Layer: exit2_layers.0.bias | Grad Mean: 0.096409 | Grad Max: 4.623919 
-> Layer: exit2_layers.3.weight | Grad Mean: 0.000567 | Grad Max: 0.016998 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052398 | Grad Max: 0.266506 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000871 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010680 | Grad Max: 0.022067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000469 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002735 | Grad Max: 0.007766 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001330 | Grad Max: 0.003371 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039651 | Grad Max: 0.039651 [GRADIENT NORM TOTAL] 17.6313 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.971 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009693 0.49903068] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 687/1361 | B: 660/1388 | C: 594/1454 [LOSS Ex1] A: 0.63445 | B: 0.61209 | C: 0.60793 [LOGITS Ex2 A] Mean Abs: 2.134 | Max: 7.129 [LOSS Ex2] A: 0.10418 | B: 0.32003 | C: 0.21101 ** [JOINT LOSS] ** : 0.829898 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002745 | Grad Max: 0.099065 -> Layer: shared_layers.0.bias | Grad Mean: 0.139755 | Grad Max: 0.930132 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005322 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005786 | Grad Max: 0.005786 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000966 | Grad Max: 0.201411 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017405 | Grad Max: 1.122477 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000089 | Grad Max: 0.003535 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007786 | Grad Max: 0.052553 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001603 | Grad Max: 0.004293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000118 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000438 | Grad Max: 0.001602 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000478 | Grad Max: 0.001383 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006035 | Grad Max: 0.006035 [GRADIENT NORM TOTAL] 3.3941 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.868 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5429203 0.45707968] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 647/1401 | C: 584/1464 [LOSS Ex1] A: 0.63506 | B: 0.61253 | C: 0.60756 [LOGITS Ex2 A] Mean Abs: 2.024 | Max: 5.942 [LOSS Ex2] A: 0.11537 | B: 0.33888 | C: 0.22309 ** [JOINT LOSS] ** : 0.844164 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011673 | Grad Max: 0.308583 -> Layer: shared_layers.0.bias | Grad Mean: 0.910820 | Grad Max: 4.084969 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005329 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005464 | Grad Max: 0.005464 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006029 | Grad Max: 1.060966 -> Layer: exit2_layers.0.bias | Grad Mean: 0.112320 | Grad Max: 5.882243 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000667 | Grad Max: 0.021103 -> Layer: exit2_layers.3.bias | Grad Mean: 0.062110 | Grad Max: 0.322762 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000814 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012388 | Grad Max: 0.023435 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000617 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003110 | Grad Max: 0.009519 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001556 | Grad Max: 0.003902 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045070 | Grad Max: 0.045070 [GRADIENT NORM TOTAL] 20.1214 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.149 | Max: 1.088 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81055164 0.18944834] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 656/1392 | C: 595/1453 [LOSS Ex1] A: 0.62853 | B: 0.60804 | C: 0.60840 [LOGITS Ex2 A] Mean Abs: 2.063 | Max: 6.368 [LOSS Ex2] A: 0.10949 | B: 0.35395 | C: 0.25378 ** [JOINT LOSS] ** : 0.854063 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012816 | Grad Max: 0.396530 -> Layer: shared_layers.0.bias | Grad Mean: 1.175350 | Grad Max: 5.188452 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.005944 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000872 | Grad Max: 0.000872 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007643 | Grad Max: 1.182893 -> Layer: exit2_layers.0.bias | Grad Mean: 0.142581 | Grad Max: 6.539235 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000846 | Grad Max: 0.027465 -> Layer: exit2_layers.3.bias | Grad Mean: 0.079376 | Grad Max: 0.412340 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000113 | Grad Max: 0.001093 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015857 | Grad Max: 0.030678 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000767 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004049 | Grad Max: 0.012402 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001992 | Grad Max: 0.004314 -> Layer: exit2_layers.12.bias | Grad Mean: 0.059023 | Grad Max: 0.059023 [GRADIENT NORM TOTAL] 25.5983 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003362 0.49966374] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 614/1434 [LOSS Ex1] A: 0.63572 | B: 0.61219 | C: 0.60535 [LOGITS Ex2 A] Mean Abs: 2.080 | Max: 5.536 [LOSS Ex2] A: 0.10526 | B: 0.33345 | C: 0.21694 ** [JOINT LOSS] ** : 
0.836299 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011269 | Grad Max: 0.324552 -> Layer: shared_layers.0.bias | Grad Mean: 0.931066 | Grad Max: 4.462290 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005596 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000423 | Grad Max: 0.000423 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005997 | Grad Max: 0.969616 -> Layer: exit2_layers.0.bias | Grad Mean: 0.111456 | Grad Max: 5.369363 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000659 | Grad Max: 0.022469 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061692 | Grad Max: 0.317006 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000846 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012402 | Grad Max: 0.024315 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000670 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003118 | Grad Max: 0.010489 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001528 | Grad Max: 0.003966 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044529 | Grad Max: 0.044529 [GRADIENT NORM TOTAL] 20.2734 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.852 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7386077 0.2613923] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 714/1334 | B: 660/1388 | C: 583/1465 [LOSS Ex1] A: 0.63088 | B: 0.61203 | C: 0.60879 [LOGITS Ex2 A] Mean Abs: 2.123 | Max: 6.623 [LOSS Ex2] A: 0.11592 | B: 0.31812 | C: 0.20940 ** [JOINT LOSS] ** : 0.831713 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005207 | Grad Max: 0.151768 -> Layer: shared_layers.0.bias | Grad Mean: 0.235579 | Grad Max: 1.257832 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005258 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004167 | Grad Max: 0.004167 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001646 | Grad 
Max: 0.438887 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029822 | Grad Max: 2.465098 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000135 | Grad Max: 0.005953 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011795 | Grad Max: 0.070598 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000018 | Grad Max: 0.000292 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002361 | Grad Max: 0.005591 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000169 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000596 | Grad Max: 0.002076 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000422 | Grad Max: 0.001787 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008953 | Grad Max: 0.008953 [GRADIENT NORM TOTAL] 5.8254 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.986 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6403089 0.3596911] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 647/1401 | C: 604/1444 [LOSS Ex1] A: 0.62902 | B: 0.61246 | C: 0.60556 [LOGITS Ex2 A] Mean Abs: 2.235 | Max: 9.290 [LOSS Ex2] A: 0.11120 | B: 0.34056 | C: 0.22825 ** [JOINT LOSS] ** : 0.842349 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009533 | Grad Max: 0.358694 -> Layer: shared_layers.0.bias | Grad Mean: 0.988426 | Grad Max: 4.726862 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006107 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002286 | Grad Max: 0.002286 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006286 | Grad Max: 0.951131 -> Layer: exit2_layers.0.bias | Grad Mean: 0.117165 | Grad Max: 5.321892 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000673 | Grad Max: 0.021844 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063575 | Grad Max: 0.327912 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000886 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012625 | Grad Max: 0.025510 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000552 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003189 | Grad Max: 0.009560 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001485 | Grad Max: 0.003679 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044985 | Grad Max: 0.044985 [GRADIENT NORM TOTAL] 21.8177 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.210 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081578 0.49184218] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 656/1392 | C: 576/1472 [LOSS Ex1] A: 0.62976 | B: 0.60798 | C: 0.60975 [LOGITS Ex2 A] Mean Abs: 2.240 | Max: 8.202 [LOSS Ex2] A: 0.11755 | B: 0.35946 | C: 0.24298 ** [JOINT LOSS] ** : 0.855824 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015140 | Grad Max: 0.540683 -> Layer: shared_layers.0.bias | Grad Mean: 1.474462 | Grad Max: 7.200821 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005681 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006735 | Grad Max: 0.006735 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009487 | Grad Max: 1.556246 -> Layer: exit2_layers.0.bias | Grad Mean: 0.176526 | Grad Max: 8.655821 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001022 | Grad Max: 0.032375 -> Layer: exit2_layers.3.bias | Grad Mean: 0.096072 | Grad Max: 0.487057 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000137 | Grad Max: 0.001331 -> Layer: exit2_layers.6.bias | Grad Mean: 0.019191 | Grad Max: 0.037839 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000056 | Grad Max: 0.000881 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004895 | Grad Max: 0.014409 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002379 | Grad Max: 0.004922 -> Layer: exit2_layers.12.bias | Grad Mean: 0.070518 | Grad Max: 0.070518 [GRADIENT NORM TOTAL] 32.6617 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.140 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5063253 0.49367473] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 719/1329 | B: 614/1242 | C: 610/1438 [LOSS Ex1] A: 0.62644 | B: 0.61212 | C: 0.60542 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 7.251 [LOSS Ex2] A: 0.12320 | B: 0.34058 | C: 0.25586 ** [JOINT LOSS] ** : 0.854543 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015828 | Grad Max: 0.480430 -> Layer: shared_layers.0.bias | Grad Mean: 1.332058 | Grad Max: 6.388570 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.006254 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000640 | Grad Max: 0.000640 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008483 | Grad Max: 1.449577 -> Layer: exit2_layers.0.bias | Grad Mean: 0.157754 | Grad Max: 8.069330 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000918 | Grad Max: 0.030032 -> Layer: exit2_layers.3.bias | Grad Mean: 0.086027 | Grad Max: 0.455311 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000124 | Grad Max: 0.001213 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017284 | Grad Max: 0.034699 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000839 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004437 | Grad Max: 0.013258 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002193 | Grad Max: 0.004876 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063962 | Grad Max: 0.063962 [GRADIENT NORM TOTAL] 29.2266 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.175 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5095419 0.49045804] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 660/1388 | C: 612/1436 [LOSS Ex1] A: 0.62541 | B: 0.61197 | C: 0.60031 [LOGITS Ex2 A] Mean Abs: 
2.181 | Max: 5.593 [LOSS Ex2] A: 0.12466 | B: 0.32195 | C: 0.22295 ** [JOINT LOSS] ** : 0.835747 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008486 | Grad Max: 0.194835 -> Layer: shared_layers.0.bias | Grad Mean: 0.635235 | Grad Max: 2.800750 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002249 | Grad Max: 0.006457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008632 | Grad Max: 0.008632 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004172 | Grad Max: 0.551481 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077279 | Grad Max: 3.059848 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000460 | Grad Max: 0.014970 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042767 | Grad Max: 0.213320 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000688 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008602 | Grad Max: 0.017739 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000410 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002144 | Grad Max: 0.006097 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001021 | Grad Max: 0.002669 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029596 | Grad Max: 0.029596 [GRADIENT NORM TOTAL] 13.5706 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.973 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50098044 0.49901956] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 687/1361 | B: 647/1401 | C: 405/971 [LOSS Ex1] A: 0.63435 | B: 0.61240 | C: 0.60750 [LOGITS Ex2 A] Mean Abs: 2.091 | Max: 5.206 [LOSS Ex2] A: 0.10427 | B: 0.33059 | C: 0.24025 ** [JOINT LOSS] ** : 0.843120 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005556 | Grad Max: 0.212607 -> Layer: shared_layers.0.bias | Grad Mean: 0.562217 | Grad Max: 2.663414 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005431 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.006201 | Grad Max: 0.006201 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003592 | Grad Max: 0.687723 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066616 | Grad Max: 3.858468 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000380 | Grad Max: 0.013010 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036071 | Grad Max: 0.199566 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000562 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007284 | Grad Max: 0.015156 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000390 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001897 | Grad Max: 0.005873 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000975 | Grad Max: 0.002784 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028842 | Grad Max: 0.028842 [GRADIENT NORM TOTAL] 12.4873 [EPOCH SUMMARY] Train Loss: 0.8401 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8339 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 162/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.870 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.542831 0.457169] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.080 [MASKS] A(Pass/Fail): 691/1357 | B: 657/1391 | C: 621/1427 [LOSS Ex1] A: 0.63496 | B: 0.60792 | C: 0.60527 [LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.724 [LOSS Ex2] A: 0.11886 | B: 0.33834 | C: 0.22720 ** [JOINT LOSS] ** : 0.844180 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011118 | Grad Max: 0.334421 -> Layer: shared_layers.0.bias | Grad Mean: 1.021909 | Grad Max: 4.495038 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005670 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008070 | Grad Max: 0.008070 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006467 | Grad Max: 0.859121 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.120986 | Grad Max: 4.828690 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000732 | Grad Max: 0.024013 -> Layer: exit2_layers.3.bias | Grad Mean: 0.068903 | Grad Max: 0.361888 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000098 | Grad Max: 0.000981 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013702 | Grad Max: 0.027774 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000633 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003512 | Grad Max: 0.010810 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001731 | Grad Max: 0.003982 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050764 | Grad Max: 0.050764 [GRADIENT NORM TOTAL] 21.7739 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.091 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81092757 0.1890724 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 614/1242 | C: 588/1460 [LOSS Ex1] A: 0.62843 | B: 0.61207 | C: 0.60357 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.226 [LOSS Ex2] A: 0.10353 | B: 0.33489 | C: 0.22732 ** [JOINT LOSS] ** : 0.836607 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007212 | Grad Max: 0.311165 -> Layer: shared_layers.0.bias | Grad Mean: 0.899818 | Grad Max: 4.124870 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002210 | Grad Max: 0.005807 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004439 | Grad Max: 0.004439 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005582 | Grad Max: 0.788126 -> Layer: exit2_layers.0.bias | Grad Mean: 0.104532 | Grad Max: 4.409972 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000627 | Grad Max: 0.020331 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059922 | Grad Max: 0.311315 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000801 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011872 | Grad Max: 0.023300 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000563 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003030 | Grad Max: 0.009520 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001456 | Grad Max: 0.003558 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043182 | Grad Max: 0.043182 [GRADIENT NORM TOTAL] 19.2567 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.211 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003988 0.4996012] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 660/1388 | C: 595/1453 [LOSS Ex1] A: 0.63562 | B: 0.61191 | C: 0.60738 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 6.378 [LOSS Ex2] A: 0.09523 | B: 0.31972 | C: 0.21826 ** [JOINT LOSS] ** : 0.829373 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003212 | Grad Max: 0.131001 -> Layer: shared_layers.0.bias | Grad Mean: 0.347285 | Grad Max: 1.617949 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002028 | Grad Max: 0.005381 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004503 | Grad Max: 0.004503 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.297631 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039684 | Grad Max: 1.639256 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000230 | Grad Max: 0.009496 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021822 | Grad Max: 0.131124 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000360 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004428 | Grad Max: 0.009735 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000258 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001143 | Grad Max: 0.003711 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000565 | Grad Max: 0.002340 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016450 | Grad Max: 0.016450 [GRADIENT NORM TOTAL] 7.4713 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) 
| Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.853 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7388318 0.26116812] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 713/1335 | B: 648/1400 | C: 607/1441 [LOSS Ex1] A: 0.63077 | B: 0.61235 | C: 0.60340 [LOGITS Ex2 A] Mean Abs: 2.192 | Max: 6.114 [LOSS Ex2] A: 0.12217 | B: 0.33165 | C: 0.23210 ** [JOINT LOSS] ** : 0.844144 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009215 | Grad Max: 0.356028 -> Layer: shared_layers.0.bias | Grad Mean: 0.922336 | Grad Max: 4.744699 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.006019 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000592 | Grad Max: 0.000592 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005878 | Grad Max: 0.865191 -> Layer: exit2_layers.0.bias | Grad Mean: 0.109741 | Grad Max: 4.845566 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000622 | Grad Max: 0.021071 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059067 | Grad Max: 0.305509 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000818 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011830 | Grad Max: 0.023053 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000594 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003051 | Grad Max: 0.008826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001493 | Grad Max: 0.003624 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044148 | Grad Max: 0.044148 [GRADIENT NORM TOTAL] 20.5151 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.988 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64034617 0.35965377] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 657/1391 | C: 578/1470 [LOSS Ex1] A: 0.62891 | B: 0.60786 | C: 0.61011 [LOGITS Ex2 A] Mean Abs: 2.245 | Max: 9.645 
[LOSS Ex2] A: 0.11549 | B: 0.34977 | C: 0.23964 ** [JOINT LOSS] ** : 0.850594 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012852 | Grad Max: 0.532753 -> Layer: shared_layers.0.bias | Grad Mean: 1.339190 | Grad Max: 7.069860 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.006310 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011326 | Grad Max: 0.011326 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008578 | Grad Max: 1.427239 -> Layer: exit2_layers.0.bias | Grad Mean: 0.160076 | Grad Max: 7.932341 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000901 | Grad Max: 0.031890 -> Layer: exit2_layers.3.bias | Grad Mean: 0.085697 | Grad Max: 0.444281 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000120 | Grad Max: 0.001127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017171 | Grad Max: 0.032289 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000815 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004422 | Grad Max: 0.013307 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002112 | Grad Max: 0.004443 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063003 | Grad Max: 0.063003 [GRADIENT NORM TOTAL] 29.9756 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.212 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50821733 0.4917827 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 614/1434 [LOSS Ex1] A: 0.62966 | B: 0.61201 | C: 0.60687 [LOGITS Ex2 A] Mean Abs: 2.186 | Max: 7.123 [LOSS Ex2] A: 0.11596 | B: 0.34282 | C: 0.26388 ** [JOINT LOSS] ** : 0.857064 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013117 | Grad Max: 0.489573 -> Layer: shared_layers.0.bias | Grad Mean: 1.259133 | Grad Max: 6.520995 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005363 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000896 | Grad 
Max: 0.000896 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008067 | Grad Max: 1.342071 -> Layer: exit2_layers.0.bias | Grad Mean: 0.150090 | Grad Max: 7.454862 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000850 | Grad Max: 0.030287 -> Layer: exit2_layers.3.bias | Grad Mean: 0.080491 | Grad Max: 0.435469 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000115 | Grad Max: 0.001143 -> Layer: exit2_layers.6.bias | Grad Mean: 0.016223 | Grad Max: 0.032338 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000779 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004197 | Grad Max: 0.012363 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002049 | Grad Max: 0.004386 -> Layer: exit2_layers.12.bias | Grad Mean: 0.060341 | Grad Max: 0.060341 [GRADIENT NORM TOTAL] 27.8994 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.142 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062383 0.49376172] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 719/1329 | B: 660/1388 | C: 562/1486 [LOSS Ex1] A: 0.62634 | B: 0.61186 | C: 0.60751 [LOGITS Ex2 A] Mean Abs: 2.176 | Max: 5.464 [LOSS Ex2] A: 0.11142 | B: 0.32671 | C: 0.22425 ** [JOINT LOSS] ** : 0.836029 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007207 | Grad Max: 0.246022 -> Layer: shared_layers.0.bias | Grad Mean: 0.682596 | Grad Max: 3.156439 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.006394 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002524 | Grad Max: 0.002524 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004121 | Grad Max: 0.653742 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077226 | Grad Max: 3.650321 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.016062 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043596 | Grad Max: 0.240499 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000630 
-> Layer: exit2_layers.6.bias | Grad Mean: 0.008715 | Grad Max: 0.017427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000437 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002274 | Grad Max: 0.006816 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001086 | Grad Max: 0.002959 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032871 | Grad Max: 0.032871 [GRADIENT NORM TOTAL] 14.6344 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.177 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50963676 0.49036324] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 648/1400 | C: 611/1437 [LOSS Ex1] A: 0.62530 | B: 0.61230 | C: 0.61047 [LOGITS Ex2 A] Mean Abs: 2.063 | Max: 5.504 [LOSS Ex2] A: 0.11858 | B: 0.32475 | C: 0.22959 ** [JOINT LOSS] ** : 0.840331 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004574 | Grad Max: 0.196008 -> Layer: shared_layers.0.bias | Grad Mean: 0.477681 | Grad Max: 2.605604 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.006470 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002137 | Grad Max: 0.002137 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002926 | Grad Max: 0.709886 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054695 | Grad Max: 3.947270 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000312 | Grad Max: 0.012094 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029425 | Grad Max: 0.171063 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005718 | Grad Max: 0.011408 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000305 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001475 | Grad Max: 0.004619 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000756 | Grad Max: 0.002420 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022162 | Grad Max: 0.022162 [GRADIENT 
NORM TOTAL] 11.0133 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.975 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010113 0.49898872] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 687/1361 | B: 657/1391 | C: 585/1463 [LOSS Ex1] A: 0.63425 | B: 0.60781 | C: 0.60972 [LOGITS Ex2 A] Mean Abs: 2.008 | Max: 5.402 [LOSS Ex2] A: 0.10694 | B: 0.33401 | C: 0.22264 ** [JOINT LOSS] ** : 0.838458 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011106 | Grad Max: 0.307201 -> Layer: shared_layers.0.bias | Grad Mean: 0.935803 | Grad Max: 4.136335 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.006077 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009747 | Grad Max: 0.009747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006169 | Grad Max: 1.099012 -> Layer: exit2_layers.0.bias | Grad Mean: 0.115017 | Grad Max: 6.086195 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000675 | Grad Max: 0.022288 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063275 | Grad Max: 0.344222 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000092 | Grad Max: 0.000926 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012735 | Grad Max: 0.024862 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000671 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003269 | Grad Max: 0.010557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001617 | Grad Max: 0.004285 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046703 | Grad Max: 0.046703 [GRADIENT NORM TOTAL] 20.9890 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.871 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54276896 0.45723101] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 691/1357 | B: 614/1242 | C: 646/1402 [LOSS 
Ex1] A: 0.63486 | B: 0.61196 | C: 0.60007 [LOGITS Ex2 A] Mean Abs: 2.006 | Max: 5.734 [LOSS Ex2] A: 0.11773 | B: 0.33056 | C: 0.20510 ** [JOINT LOSS] ** : 0.833429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010899 | Grad Max: 0.282616 -> Layer: shared_layers.0.bias | Grad Mean: 0.859424 | Grad Max: 3.826611 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005656 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004196 | Grad Max: 0.004196 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005676 | Grad Max: 0.903911 -> Layer: exit2_layers.0.bias | Grad Mean: 0.105407 | Grad Max: 5.017396 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000618 | Grad Max: 0.021146 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057871 | Grad Max: 0.291765 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000843 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011759 | Grad Max: 0.023548 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000607 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003016 | Grad Max: 0.010054 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001483 | Grad Max: 0.003936 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042652 | Grad Max: 0.042652 [GRADIENT NORM TOTAL] 19.0446 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.093 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81127405 0.18872589] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 660/1388 | C: 579/1469 [LOSS Ex1] A: 0.62834 | B: 0.61181 | C: 0.60500 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 7.652 [LOSS Ex2] A: 0.10306 | B: 0.32171 | C: 0.20187 ** [JOINT LOSS] ** : 0.823928 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.123583 -> Layer: shared_layers.0.bias | Grad Mean: 0.230572 | Grad Max: 1.661487 -> Layer: exit1_layers.0.weight | Grad Mean: 
0.002218 | Grad Max: 0.005871 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005600 | Grad Max: 0.005600 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001394 | Grad Max: 0.536765 -> Layer: exit2_layers.0.bias | Grad Mean: 0.025252 | Grad Max: 3.004112 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000105 | Grad Max: 0.006449 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009797 | Grad Max: 0.074446 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001869 | Grad Max: 0.005392 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000143 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001878 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000513 | Grad Max: 0.001555 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005402 | Grad Max: 0.005402 [GRADIENT NORM TOTAL] 6.1012 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500443 0.49955702] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 593/1455 [LOSS Ex1] A: 0.63553 | B: 0.61224 | C: 0.61065 [LOGITS Ex2 A] Mean Abs: 2.141 | Max: 6.220 [LOSS Ex2] A: 0.10496 | B: 0.32453 | C: 0.23303 ** [JOINT LOSS] ** : 0.840312 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006456 | Grad Max: 0.288697 -> Layer: shared_layers.0.bias | Grad Mean: 0.753723 | Grad Max: 3.975960 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.005314 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003989 | Grad Max: 0.003989 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004695 | Grad Max: 0.930153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088261 | Grad Max: 5.173518 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000492 | Grad Max: 0.016125 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046968 | Grad 
Max: 0.257391 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000662 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009289 | Grad Max: 0.019201 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000483 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002391 | Grad Max: 0.007437 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001149 | Grad Max: 0.003088 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033870 | Grad Max: 0.033870 [GRADIENT NORM TOTAL] 17.3096 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.855 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.73910874 0.2608913 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 713/1335 | B: 657/1391 | C: 578/1470 [LOSS Ex1] A: 0.63068 | B: 0.60775 | C: 0.60654 [LOGITS Ex2 A] Mean Abs: 2.143 | Max: 6.146 [LOSS Ex2] A: 0.12715 | B: 0.32942 | C: 0.25085 ** [JOINT LOSS] ** : 0.850793 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009078 | Grad Max: 0.432038 -> Layer: shared_layers.0.bias | Grad Mean: 1.090011 | Grad Max: 5.749931 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002166 | Grad Max: 0.005999 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000855 | Grad Max: 0.000855 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006854 | Grad Max: 1.305509 -> Layer: exit2_layers.0.bias | Grad Mean: 0.128257 | Grad Max: 7.270782 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000713 | Grad Max: 0.026677 -> Layer: exit2_layers.3.bias | Grad Mean: 0.068203 | Grad Max: 0.364068 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000943 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013608 | Grad Max: 0.027081 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000676 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003532 | Grad Max: 0.010933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001684 | Grad Max: 
0.003917 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050259 | Grad Max: 0.050259 [GRADIENT NORM TOTAL] 24.7002 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.989 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6405093 0.35949066] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 614/1242 | C: 402/974 [LOSS Ex1] A: 0.62881 | B: 0.61190 | C: 0.60563 [LOGITS Ex2 A] Mean Abs: 2.181 | Max: 8.448 [LOSS Ex2] A: 0.10544 | B: 0.31132 | C: 0.21571 ** [JOINT LOSS] ** : 0.826273 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007795 | Grad Max: 0.354998 -> Layer: shared_layers.0.bias | Grad Mean: 0.816450 | Grad Max: 4.356618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.005602 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001070 | Grad Max: 0.001070 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005328 | Grad Max: 0.980862 -> Layer: exit2_layers.0.bias | Grad Mean: 0.099555 | Grad Max: 5.460134 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000554 | Grad Max: 0.018176 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052701 | Grad Max: 0.271809 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000719 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010635 | Grad Max: 0.021004 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000529 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002846 | Grad Max: 0.008001 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001349 | Grad Max: 0.003791 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041549 | Grad Max: 0.041549 [GRADIENT NORM TOTAL] 18.7100 [EPOCH SUMMARY] Train Loss: 0.8394 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8105 | Alpha: 0.5500 No improve count: 4/15 ############################## EPOCH 163/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50822383 0.4917762 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 660/1388 | C: 588/1460 [LOSS Ex1] A: 0.62957 | B: 0.61175 | C: 0.60804 [LOGITS Ex2 A] Mean Abs: 2.103 | Max: 8.605 [LOSS Ex2] A: 0.10339 | B: 0.31850 | C: 0.21870 ** [JOINT LOSS] ** : 0.829980 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003092 | Grad Max: 0.108462 -> Layer: shared_layers.0.bias | Grad Mean: 0.085631 | Grad Max: 0.545549 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000071 | Grad Max: 0.000071 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000706 | Grad Max: 0.197091 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011666 | Grad Max: 1.071632 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.002595 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003179 | Grad Max: 0.033154 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000122 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000563 | Grad Max: 0.002605 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000152 | Grad Max: 0.000829 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000527 | Grad Max: 0.001527 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001079 | Grad Max: 0.001079 [GRADIENT NORM TOTAL] 2.4782 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.145 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5062189 0.4937811] | Indices: 
[0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 719/1329 | B: 648/1400 | C: 599/1449 [LOSS Ex1] A: 0.62624 | B: 0.61218 | C: 0.60649 [LOGITS Ex2 A] Mean Abs: 2.070 | Max: 6.261 [LOSS Ex2] A: 0.10856 | B: 0.33567 | C: 0.22991 ** [JOINT LOSS] ** : 0.839680 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009117 | Grad Max: 0.278879 -> Layer: shared_layers.0.bias | Grad Mean: 0.791593 | Grad Max: 3.542408 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006195 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000721 | Grad Max: 0.000721 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005066 | Grad Max: 0.605913 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095113 | Grad Max: 3.371073 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000568 | Grad Max: 0.017940 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053961 | Grad Max: 0.272187 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000815 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010817 | Grad Max: 0.022202 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000559 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002803 | Grad Max: 0.008742 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001392 | Grad Max: 0.003529 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040354 | Grad Max: 0.040354 [GRADIENT NORM TOTAL] 16.8361 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50967306 0.49032697] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 657/1391 | C: 617/1431 [LOSS Ex1] A: 0.62520 | B: 0.60768 | C: 0.60545 [LOGITS Ex2 A] Mean Abs: 2.016 | Max: 6.276 [LOSS Ex2] A: 0.12606 | B: 0.32711 | C: 0.21305 ** [JOINT LOSS] ** : 0.834851 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007962 | Grad Max: 
0.327408 -> Layer: shared_layers.0.bias | Grad Mean: 0.898724 | Grad Max: 4.261260 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006163 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002788 | Grad Max: 0.002788 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005681 | Grad Max: 0.987189 -> Layer: exit2_layers.0.bias | Grad Mean: 0.106845 | Grad Max: 5.440413 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000638 | Grad Max: 0.022411 -> Layer: exit2_layers.3.bias | Grad Mean: 0.060833 | Grad Max: 0.335769 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000856 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011997 | Grad Max: 0.024231 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000641 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003111 | Grad Max: 0.010064 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001498 | Grad Max: 0.003917 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044219 | Grad Max: 0.044219 [GRADIENT NORM TOTAL] 19.8597 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.977 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50099266 0.49900737] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 687/1361 | B: 614/1242 | C: 607/1441 [LOSS Ex1] A: 0.63415 | B: 0.61182 | C: 0.60657 [LOGITS Ex2 A] Mean Abs: 2.039 | Max: 5.242 [LOSS Ex2] A: 0.10581 | B: 0.31334 | C: 0.22078 ** [JOINT LOSS] ** : 0.830821 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004589 | Grad Max: 0.210345 -> Layer: shared_layers.0.bias | Grad Mean: 0.582710 | Grad Max: 2.773826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005260 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005593 | Grad Max: 0.005593 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003664 | Grad Max: 0.721975 -> Layer: exit2_layers.0.bias | Grad Mean: 0.068277 | Grad Max: 4.019463 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000416 | Grad Max: 0.015011 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039840 | Grad Max: 0.228691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000618 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007916 | Grad Max: 0.016753 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000446 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002039 | Grad Max: 0.006702 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000990 | Grad Max: 0.003090 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028832 | Grad Max: 0.028832 [GRADIENT NORM TOTAL] 13.2518 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.873 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5426817 0.4573183] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 691/1357 | B: 660/1388 | C: 591/1457 [LOSS Ex1] A: 0.63477 | B: 0.61167 | C: 0.60896 [LOGITS Ex2 A] Mean Abs: 2.094 | Max: 6.387 [LOSS Ex2] A: 0.11706 | B: 0.32283 | C: 0.21550 ** [JOINT LOSS] ** : 0.836933 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005293 | Grad Max: 0.167962 -> Layer: shared_layers.0.bias | Grad Mean: 0.332246 | Grad Max: 1.488940 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002050 | Grad Max: 0.005800 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004559 | Grad Max: 0.004559 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.253167 -> Layer: exit2_layers.0.bias | Grad Mean: 0.038781 | Grad Max: 1.423368 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000204 | Grad Max: 0.007066 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018910 | Grad Max: 0.100349 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000350 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003916 | Grad Max: 0.008856 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000239 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001041 | Grad Max: 0.003452 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000512 | Grad Max: 0.002069 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015049 | Grad Max: 0.015049 [GRADIENT NORM TOTAL] 7.1214 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81180334 0.18819672] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 648/1400 | C: 590/1458 [LOSS Ex1] A: 0.62823 | B: 0.61209 | C: 0.60693 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 6.982 [LOSS Ex2] A: 0.12305 | B: 0.32984 | C: 0.22648 ** [JOINT LOSS] ** : 0.842211 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010983 | Grad Max: 0.297347 -> Layer: shared_layers.0.bias | Grad Mean: 0.708925 | Grad Max: 3.213648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000158 | Grad Max: 0.000158 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004495 | Grad Max: 0.592490 -> Layer: exit2_layers.0.bias | Grad Mean: 0.082163 | Grad Max: 3.281419 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.013090 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044499 | Grad Max: 0.212258 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000773 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009206 | Grad Max: 0.019725 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000441 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002401 | Grad Max: 0.007335 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001164 | Grad Max: 0.003109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033295 | Grad Max: 0.033295 [GRADIENT NORM TOTAL] 14.9356 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.144 | Max: 1.216 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004322 0.49956778] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 657/1391 | C: 598/1450 [LOSS Ex1] A: 0.63542 | B: 0.60760 | C: 0.60775 [LOGITS Ex2 A] Mean Abs: 2.164 | Max: 6.488 [LOSS Ex2] A: 0.10305 | B: 0.29598 | C: 0.22764 ** [JOINT LOSS] ** : 0.825814 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006213 | Grad Max: 0.146569 -> Layer: shared_layers.0.bias | Grad Mean: 0.409974 | Grad Max: 1.465619 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005352 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003240 | Grad Max: 0.003240 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002789 | Grad Max: 0.381717 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051732 | Grad Max: 2.072227 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.008924 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028328 | Grad Max: 0.133202 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000445 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005753 | Grad Max: 0.011484 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001473 | Grad Max: 0.003963 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000721 | Grad Max: 0.002389 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020807 | Grad Max: 0.020807 [GRADIENT NORM TOTAL] 8.8663 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.857 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7396348 0.26036522] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 713/1335 | B: 614/1242 | C: 604/1444 [LOSS Ex1] A: 0.63057 | B: 0.61174 | C: 0.60357 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.868 [LOSS Ex2] A: 0.11797 | B: 0.31056 | C: 0.21132 ** [JOINT LOSS] ** : 0.828572 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005454 | Grad Max: 0.210866 -> Layer: shared_layers.0.bias | Grad Mean: 0.512914 | Grad Max: 2.717791 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.006038 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003779 | Grad Max: 0.003779 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003221 | Grad Max: 0.616527 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059616 | Grad Max: 3.455055 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.010798 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033997 | Grad Max: 0.172608 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000504 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006814 | Grad Max: 0.013888 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000369 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001739 | Grad Max: 0.006008 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000877 | Grad Max: 0.002876 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024432 | Grad Max: 0.024432 [GRADIENT NORM TOTAL] 11.2735 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.992 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6407597 0.35924035] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.085 [MASKS] A(Pass/Fail): 590/1026 | B: 660/1388 | C: 606/1442 [LOSS Ex1] A: 0.62870 | B: 0.61159 | C: 0.60571 [LOGITS Ex2 A] Mean Abs: 2.116 | Max: 8.251 [LOSS Ex2] A: 0.11647 | B: 0.33116 | C: 0.22854 ** [JOINT LOSS] ** : 0.840722 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008609 | Grad Max: 0.269212 -> Layer: shared_layers.0.bias | Grad Mean: 0.757692 | Grad Max: 3.552843 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.006577 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014296 | Grad Max: 0.014296 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004798 | Grad Max: 
0.666473 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088930 | Grad Max: 3.708670 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.015986 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050337 | Grad Max: 0.255185 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000805 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010319 | Grad Max: 0.020394 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000575 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002721 | Grad Max: 0.008481 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001454 | Grad Max: 0.003800 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040729 | Grad Max: 0.040729 [GRADIENT NORM TOTAL] 16.0836 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.217 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50818664 0.49181336] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 597/1451 [LOSS Ex1] A: 0.62944 | B: 0.61201 | C: 0.60593 [LOGITS Ex2 A] Mean Abs: 2.130 | Max: 7.792 [LOSS Ex2] A: 0.09141 | B: 0.31572 | C: 0.20937 ** [JOINT LOSS] ** : 0.821296 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003488 | Grad Max: 0.105456 -> Layer: shared_layers.0.bias | Grad Mean: 0.287392 | Grad Max: 1.253378 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.005803 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004014 | Grad Max: 0.004014 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001754 | Grad Max: 0.227105 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032039 | Grad Max: 1.265995 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000205 | Grad Max: 0.006285 -> Layer: exit2_layers.3.bias | Grad Mean: 0.019215 | Grad Max: 0.097884 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000425 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003878 | Grad Max: 0.008888 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000987 | Grad Max: 0.003751 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000534 | Grad Max: 0.002465 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013844 | Grad Max: 0.013844 [GRADIENT NORM TOTAL] 5.8009 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.147 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50615484 0.49384516] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 719/1329 | B: 657/1391 | C: 607/1441 [LOSS Ex1] A: 0.62613 | B: 0.60751 | C: 0.60424 [LOGITS Ex2 A] Mean Abs: 2.192 | Max: 7.013 [LOSS Ex2] A: 0.10852 | B: 0.31383 | C: 0.23272 ** [JOINT LOSS] ** : 0.830982 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007749 | Grad Max: 0.288793 -> Layer: shared_layers.0.bias | Grad Mean: 0.738613 | Grad Max: 3.791044 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002308 | Grad Max: 0.006260 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006687 | Grad Max: 0.006687 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004769 | Grad Max: 0.762336 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088966 | Grad Max: 4.264806 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000509 | Grad Max: 0.015776 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048532 | Grad Max: 0.252863 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000695 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009745 | Grad Max: 0.019478 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000490 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002566 | Grad Max: 0.007532 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001207 | Grad Max: 0.003204 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036072 | Grad Max: 0.036072 [GRADIENT NORM TOTAL] 16.4838 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.182 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50969577 0.4903042 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 614/1242 | C: 593/1455 [LOSS Ex1] A: 0.62508 | B: 0.61164 | C: 0.60609 [LOGITS Ex2 A] Mean Abs: 2.174 | Max: 5.848 [LOSS Ex2] A: 0.12355 | B: 0.32305 | C: 0.24570 ** [JOINT LOSS] ** : 0.845040 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010589 | Grad Max: 0.371576 -> Layer: shared_layers.0.bias | Grad Mean: 1.006228 | Grad Max: 4.936773 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.006152 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001727 | Grad Max: 0.001727 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006392 | Grad Max: 1.132149 -> Layer: exit2_layers.0.bias | Grad Mean: 0.118940 | Grad Max: 6.294788 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000686 | Grad Max: 0.022460 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065383 | Grad Max: 0.333602 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.001015 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013220 | Grad Max: 0.026630 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000688 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003450 | Grad Max: 0.010637 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001662 | Grad Max: 0.003735 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048349 | Grad Max: 0.048349 [GRADIENT NORM TOTAL] 22.2536 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.979 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009275 0.49907246] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 689/1359 | B: 660/1388 | C: 606/1442 [LOSS Ex1] A: 0.63404 | B: 0.61150 | C: 0.60363 [LOGITS Ex2 A] Mean 
Abs: 2.139 | Max: 6.029 [LOSS Ex2] A: 0.10618 | B: 0.33382 | C: 0.22185 ** [JOINT LOSS] ** : 0.837005 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007498 | Grad Max: 0.255804 -> Layer: shared_layers.0.bias | Grad Mean: 0.724119 | Grad Max: 3.382624 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.005506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002988 | Grad Max: 0.002988 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004612 | Grad Max: 0.722329 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086044 | Grad Max: 4.039410 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.016484 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048915 | Grad Max: 0.258978 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000793 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009822 | Grad Max: 0.020555 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000517 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002569 | Grad Max: 0.007718 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001227 | Grad Max: 0.003417 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036018 | Grad Max: 0.036018 [GRADIENT NORM TOTAL] 15.8381 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.876 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54270864 0.4572914 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 691/1357 | B: 648/1400 | C: 381/995 [LOSS Ex1] A: 0.63467 | B: 0.61192 | C: 0.61159 [LOGITS Ex2 A] Mean Abs: 2.085 | Max: 5.793 [LOSS Ex2] A: 0.11102 | B: 0.31265 | C: 0.24001 ** [JOINT LOSS] ** : 0.840615 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003102 | Grad Max: 0.065737 -> Layer: shared_layers.0.bias | Grad Mean: 0.103561 | Grad Max: 0.464813 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005308 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.006595 | Grad Max: 0.006595 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000858 | Grad Max: 0.363393 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014831 | Grad Max: 2.028412 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.002746 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003045 | Grad Max: 0.028806 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000650 | Grad Max: 0.003309 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000160 | Grad Max: 0.000799 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001208 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002053 | Grad Max: 0.002053 [GRADIENT NORM TOTAL] 3.6177 [EPOCH SUMMARY] Train Loss: 0.8346 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8185 | Alpha: 0.5500 No improve count: 5/15 ############################## EPOCH 164/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.097 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8123671 0.18763298] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.085 [MASKS] A(Pass/Fail): 751/1297 | B: 657/1391 | C: 610/1438 [LOSS Ex1] A: 0.62812 | B: 0.60742 | C: 0.60701 [LOGITS Ex2 A] Mean Abs: 2.080 | Max: 7.881 [LOSS Ex2] A: 0.10182 | B: 0.31354 | C: 0.24158 ** [JOINT LOSS] ** : 0.833163 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005939 | Grad Max: 0.196100 -> Layer: shared_layers.0.bias | Grad Mean: 0.541875 | Grad Max: 2.480964 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002219 | Grad Max: 0.005724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000862 | Grad Max: 0.000862 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003427 | Grad Max: 0.434681 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.064313 | Grad Max: 2.456796 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000357 | Grad Max: 0.012342 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033730 | Grad Max: 0.192081 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000536 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006735 | Grad Max: 0.013325 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000373 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001795 | Grad Max: 0.005260 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000982 | Grad Max: 0.002832 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027346 | Grad Max: 0.027346 [GRADIENT NORM TOTAL] 11.4644 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.219 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004076 0.49959242] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 625/1423 [LOSS Ex1] A: 0.63531 | B: 0.61155 | C: 0.60507 [LOGITS Ex2 A] Mean Abs: 2.104 | Max: 5.872 [LOSS Ex2] A: 0.09123 | B: 0.31035 | C: 0.20752 ** [JOINT LOSS] ** : 0.820343 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005323 | Grad Max: 0.200011 -> Layer: shared_layers.0.bias | Grad Mean: 0.539662 | Grad Max: 2.535846 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002044 | Grad Max: 0.005591 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002440 | Grad Max: 0.002440 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003586 | Grad Max: 0.606624 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066864 | Grad Max: 3.382406 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.011949 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036435 | Grad Max: 0.203069 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000552 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007262 | Grad Max: 0.013996 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000411 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001894 | Grad Max: 0.005656 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000998 | Grad Max: 0.003137 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027735 | Grad Max: 0.027735 [GRADIENT NORM TOTAL] 12.2514 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.859 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74004775 0.25995222] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 713/1335 | B: 661/1387 | C: 589/1459 [LOSS Ex1] A: 0.63045 | B: 0.61142 | C: 0.60715 [LOGITS Ex2 A] Mean Abs: 2.116 | Max: 6.855 [LOSS Ex2] A: 0.11469 | B: 0.31607 | C: 0.21096 ** [JOINT LOSS] ** : 0.830247 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.104676 -> Layer: shared_layers.0.bias | Grad Mean: 0.114934 | Grad Max: 0.572253 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002110 | Grad Max: 0.005757 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001981 | Grad Max: 0.001981 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000808 | Grad Max: 0.451691 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013317 | Grad Max: 2.526882 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003399 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002609 | Grad Max: 0.031904 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000176 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000555 | Grad Max: 0.003323 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000144 | Grad Max: 0.000924 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001265 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001475 | Grad Max: 0.001475 [GRADIENT NORM TOTAL] 3.8835 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 
32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.994 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6408966 0.35910338] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 590/1026 | B: 648/1400 | C: 586/1462 [LOSS Ex1] A: 0.62858 | B: 0.61183 | C: 0.60767 [LOGITS Ex2 A] Mean Abs: 2.205 | Max: 8.710 [LOSS Ex2] A: 0.10143 | B: 0.31831 | C: 0.23265 ** [JOINT LOSS] ** : 0.833485 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004222 | Grad Max: 0.160741 -> Layer: shared_layers.0.bias | Grad Mean: 0.432268 | Grad Max: 2.188075 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.006044 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009738 | Grad Max: 0.009738 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002875 | Grad Max: 0.422913 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053138 | Grad Max: 2.353067 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000290 | Grad Max: 0.009289 -> Layer: exit2_layers.3.bias | Grad Mean: 0.027898 | Grad Max: 0.145468 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000447 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005663 | Grad Max: 0.012592 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000298 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001457 | Grad Max: 0.004542 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000656 | Grad Max: 0.002226 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019521 | Grad Max: 0.019521 [GRADIENT NORM TOTAL] 9.7784 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.220 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081849 0.4918151] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 657/1391 | C: 587/1461 [LOSS Ex1] A: 0.62932 | B: 0.60732 | C: 0.61063 [LOGITS Ex2 A] Mean Abs: 2.167 | Max: 9.040 
[LOSS Ex2] A: 0.10415 | B: 0.29953 | C: 0.23146 ** [JOINT LOSS] ** : 0.827466 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006639 | Grad Max: 0.177953 -> Layer: shared_layers.0.bias | Grad Mean: 0.488731 | Grad Max: 2.311571 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.005439 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002440 | Grad Max: 0.002440 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003244 | Grad Max: 0.558571 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059826 | Grad Max: 3.132290 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000337 | Grad Max: 0.010397 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031197 | Grad Max: 0.138081 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000529 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006376 | Grad Max: 0.013633 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000358 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001669 | Grad Max: 0.004777 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000822 | Grad Max: 0.002592 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023512 | Grad Max: 0.023512 [GRADIENT NORM TOTAL] 10.9342 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.151 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.506101 0.49389896] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 719/1329 | B: 614/1242 | C: 597/1451 [LOSS Ex1] A: 0.62600 | B: 0.61145 | C: 0.60648 [LOGITS Ex2 A] Mean Abs: 2.153 | Max: 5.848 [LOSS Ex2] A: 0.10687 | B: 0.29700 | C: 0.22104 ** [JOINT LOSS] ** : 0.822945 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003491 | Grad Max: 0.091703 -> Layer: shared_layers.0.bias | Grad Mean: 0.255295 | Grad Max: 1.259772 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002182 | Grad Max: 0.006156 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002666 | Grad Max: 
0.002666 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001577 | Grad Max: 0.567586 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028183 | Grad Max: 3.155018 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.004957 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013779 | Grad Max: 0.077487 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000283 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002751 | Grad Max: 0.007241 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000219 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000676 | Grad Max: 0.002857 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000402 | Grad Max: 0.001828 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009030 | Grad Max: 0.009030 [GRADIENT NORM TOTAL] 6.2643 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.185 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5097777 0.49022225] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 661/1387 | C: 588/1460 [LOSS Ex1] A: 0.62495 | B: 0.61131 | C: 0.60714 [LOGITS Ex2 A] Mean Abs: 2.121 | Max: 5.638 [LOSS Ex2] A: 0.11013 | B: 0.31626 | C: 0.21055 ** [JOINT LOSS] ** : 0.826784 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003192 | Grad Max: 0.160723 -> Layer: shared_layers.0.bias | Grad Mean: 0.384230 | Grad Max: 2.078230 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005705 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005093 | Grad Max: 0.005093 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002293 | Grad Max: 0.767687 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042531 | Grad Max: 4.236332 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.008640 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021094 | Grad Max: 0.111759 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000337 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.004200 | Grad Max: 0.008919 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000271 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001095 | Grad Max: 0.004049 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000545 | Grad Max: 0.002482 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015428 | Grad Max: 0.015428 [GRADIENT NORM TOTAL] 9.2137 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.982 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009243 0.49907577] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 689/1359 | B: 648/1400 | C: 606/1442 [LOSS Ex1] A: 0.63390 | B: 0.61172 | C: 0.60378 [LOGITS Ex2 A] Mean Abs: 2.124 | Max: 5.912 [LOSS Ex2] A: 0.10631 | B: 0.32399 | C: 0.22463 ** [JOINT LOSS] ** : 0.834775 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003870 | Grad Max: 0.137155 -> Layer: shared_layers.0.bias | Grad Mean: 0.109853 | Grad Max: 0.679223 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005303 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006940 | Grad Max: 0.006940 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001044 | Grad Max: 0.213301 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017558 | Grad Max: 1.187287 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.002887 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004472 | Grad Max: 0.032090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001105 | Grad Max: 0.003833 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000301 | Grad Max: 0.001241 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001294 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005071 | Grad Max: 0.005071 [GRADIENT NORM 
TOTAL] 3.4923 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.136 | Max: 0.878 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54262245 0.45737758] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 691/1357 | B: 657/1391 | C: 648/1400 [LOSS Ex1] A: 0.63453 | B: 0.60721 | C: 0.59964 [LOGITS Ex2 A] Mean Abs: 2.136 | Max: 5.536 [LOSS Ex2] A: 0.10335 | B: 0.29958 | C: 0.20627 ** [JOINT LOSS] ** : 0.816861 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003889 | Grad Max: 0.196906 -> Layer: shared_layers.0.bias | Grad Mean: 0.416627 | Grad Max: 2.477912 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.006014 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009775 | Grad Max: 0.009775 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002602 | Grad Max: 0.546973 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048364 | Grad Max: 3.037910 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000243 | Grad Max: 0.010081 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023349 | Grad Max: 0.137830 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000390 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004617 | Grad Max: 0.009743 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001230 | Grad Max: 0.003704 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000598 | Grad Max: 0.002144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018018 | Grad Max: 0.018018 [GRADIENT NORM TOTAL] 9.7983 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81294996 0.18705004] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 751/1297 | B: 614/1242 | C: 592/1456 [LOSS Ex1] 
A: 0.62798 | B: 0.61132 | C: 0.60644 [LOGITS Ex2 A] Mean Abs: 2.159 | Max: 6.206 [LOSS Ex2] A: 0.10404 | B: 0.29987 | C: 0.21948 ** [JOINT LOSS] ** : 0.823042 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004477 | Grad Max: 0.195419 -> Layer: shared_layers.0.bias | Grad Mean: 0.376711 | Grad Max: 1.957045 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002277 | Grad Max: 0.006277 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011375 | Grad Max: 0.011375 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002442 | Grad Max: 0.611973 -> Layer: exit2_layers.0.bias | Grad Mean: 0.044797 | Grad Max: 3.405525 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.007747 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023025 | Grad Max: 0.122786 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000373 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004721 | Grad Max: 0.010186 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000261 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001262 | Grad Max: 0.003750 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000595 | Grad Max: 0.002178 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018122 | Grad Max: 0.018122 [GRADIENT NORM TOTAL] 9.0116 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.223 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004489 0.4995511] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 661/1387 | C: 610/1438 [LOSS Ex1] A: 0.63516 | B: 0.61119 | C: 0.60167 [LOGITS Ex2 A] Mean Abs: 2.151 | Max: 5.845 [LOSS Ex2] A: 0.09596 | B: 0.32191 | C: 0.20621 ** [JOINT LOSS] ** : 0.824040 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002653 | Grad Max: 0.119945 -> Layer: shared_layers.0.bias | Grad Mean: 0.278064 | Grad Max: 1.668692 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | 
Grad Max: 0.005622 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000544 | Grad Max: 0.000544 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001660 | Grad Max: 0.508190 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030264 | Grad Max: 2.770454 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000123 | Grad Max: 0.006061 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011662 | Grad Max: 0.082521 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000201 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002208 | Grad Max: 0.006020 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000138 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000572 | Grad Max: 0.001858 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000424 | Grad Max: 0.001622 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007997 | Grad Max: 0.007997 [GRADIENT NORM TOTAL] 6.9917 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 0.862 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7404907 0.2595093] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.085 [MASKS] A(Pass/Fail): 713/1335 | B: 648/1400 | C: 596/1452 [LOSS Ex1] A: 0.63029 | B: 0.61159 | C: 0.60664 [LOGITS Ex2 A] Mean Abs: 2.123 | Max: 7.062 [LOSS Ex2] A: 0.11611 | B: 0.32569 | C: 0.20762 ** [JOINT LOSS] ** : 0.832646 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005154 | Grad Max: 0.157545 -> Layer: shared_layers.0.bias | Grad Mean: 0.412381 | Grad Max: 2.232842 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005735 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005529 | Grad Max: 0.005529 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002431 | Grad Max: 0.612842 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045444 | Grad Max: 3.439521 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.009006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023758 | Grad Max: 
0.131495 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000374 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004610 | Grad Max: 0.009599 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000249 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001186 | Grad Max: 0.003393 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000657 | Grad Max: 0.002331 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018102 | Grad Max: 0.018102 [GRADIENT NORM TOTAL] 9.3774 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.998 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6410931 0.3589069] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 591/1025 | B: 658/1390 | C: 618/1430 [LOSS Ex1] A: 0.62841 | B: 0.60708 | C: 0.60450 [LOGITS Ex2 A] Mean Abs: 2.197 | Max: 9.500 [LOSS Ex2] A: 0.10225 | B: 0.29914 | C: 0.21967 ** [JOINT LOSS] ** : 0.820346 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002482 | Grad Max: 0.078259 -> Layer: shared_layers.0.bias | Grad Mean: 0.108025 | Grad Max: 0.755609 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005642 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006387 | Grad Max: 0.006387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000899 | Grad Max: 0.365488 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015455 | Grad Max: 2.045302 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000041 | Grad Max: 0.001968 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002543 | Grad Max: 0.030703 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000130 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000512 | Grad Max: 0.002721 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000136 | Grad Max: 0.000789 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001049 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.001172 | Grad Max: 0.001172 [GRADIENT NORM TOTAL] 4.0954 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508233 0.491767] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 723/1325 | B: 614/1242 | C: 373/1003 [LOSS Ex1] A: 0.62914 | B: 0.61119 | C: 0.61315 [LOGITS Ex2 A] Mean Abs: 2.190 | Max: 8.465 [LOSS Ex2] A: 0.09638 | B: 0.29462 | C: 0.21287 ** [JOINT LOSS] ** : 0.819123 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004018 | Grad Max: 0.132180 -> Layer: shared_layers.0.bias | Grad Mean: 0.275972 | Grad Max: 1.311898 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005553 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006493 | Grad Max: 0.006493 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001885 | Grad Max: 0.375996 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033779 | Grad Max: 2.090791 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000185 | Grad Max: 0.006353 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016989 | Grad Max: 0.096069 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000321 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003586 | Grad Max: 0.007774 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000199 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000970 | Grad Max: 0.002857 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000514 | Grad Max: 0.001831 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013974 | Grad Max: 0.013974 [GRADIENT NORM TOTAL] 6.4419 [EPOCH SUMMARY] Train Loss: 0.8261 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8088 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8105 -> New: 0.8088) ############################## EPOCH 165/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.155 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5059911 0.49400893] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 719/1329 | B: 661/1387 | C: 593/1455 [LOSS Ex1] A: 0.62582 | B: 0.61106 | C: 0.60804 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.448 [LOSS Ex2] A: 0.10661 | B: 0.31160 | C: 0.20058 ** [JOINT LOSS] ** : 0.821242 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001663 | Grad Max: 0.030680 -> Layer: shared_layers.0.bias | Grad Mean: 0.087666 | Grad Max: 0.548111 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.006348 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000653 | Grad Max: 0.000653 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000745 | Grad Max: 0.128430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013062 | Grad Max: 0.685926 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.002718 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003927 | Grad Max: 0.026617 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000164 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000722 | Grad Max: 0.003629 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000178 | Grad Max: 0.001116 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000439 | Grad Max: 0.001337 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002505 | Grad Max: 0.002505 [GRADIENT NORM TOTAL] 2.5043 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.190 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5098793 0.4901207] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] 
Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 648/1400 | C: 593/1455 [LOSS Ex1] A: 0.62477 | B: 0.61145 | C: 0.60282 [LOGITS Ex2 A] Mean Abs: 2.169 | Max: 7.266 [LOSS Ex2] A: 0.11953 | B: 0.31244 | C: 0.22823 ** [JOINT LOSS] ** : 0.833081 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002209 | Grad Max: 0.054849 -> Layer: shared_layers.0.bias | Grad Mean: 0.120947 | Grad Max: 0.519657 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005837 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004492 | Grad Max: 0.004492 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001041 | Grad Max: 0.423611 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018517 | Grad Max: 2.340371 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.003416 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005073 | Grad Max: 0.044171 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000194 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000860 | Grad Max: 0.003761 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000091 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000167 | Grad Max: 0.000909 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000339 | Grad Max: 0.000914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000033 | Grad Max: 0.000033 [GRADIENT NORM TOTAL] 4.4611 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.986 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009221 0.49907792] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 689/1359 | B: 658/1390 | C: 618/1430 [LOSS Ex1] A: 0.63371 | B: 0.60693 | C: 0.60229 [LOGITS Ex2 A] Mean Abs: 2.155 | Max: 6.698 [LOSS Ex2] A: 0.10536 | B: 0.29674 | C: 0.22596 ** [JOINT LOSS] ** : 0.823662 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004032 | Grad Max: 0.139623 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.176088 | Grad Max: 0.609772 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005440 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004941 | Grad Max: 0.004941 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001269 | Grad Max: 0.614647 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022270 | Grad Max: 3.388187 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000110 | Grad Max: 0.003476 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009550 | Grad Max: 0.046395 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000286 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002087 | Grad Max: 0.005838 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000161 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000544 | Grad Max: 0.001933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001927 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008619 | Grad Max: 0.008619 [GRADIENT NORM TOTAL] 5.4920 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.881 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54251957 0.45748043] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 691/1357 | B: 614/1242 | C: 603/1445 [LOSS Ex1] A: 0.63434 | B: 0.61104 | C: 0.60567 [LOGITS Ex2 A] Mean Abs: 2.150 | Max: 6.207 [LOSS Ex2] A: 0.11119 | B: 0.29489 | C: 0.21132 ** [JOINT LOSS] ** : 0.822817 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002989 | Grad Max: 0.077710 -> Layer: shared_layers.0.bias | Grad Mean: 0.126941 | Grad Max: 0.688593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005312 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006353 | Grad Max: 0.006353 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000867 | Grad Max: 0.217440 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014729 | Grad Max: 1.176293 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000043 | Grad Max: 0.002126 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002155 | Grad Max: 0.018681 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000158 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000406 | Grad Max: 0.002267 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000085 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000113 | Grad Max: 0.000933 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001281 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000653 | Grad Max: 0.000653 [GRADIENT NORM TOTAL] 3.5164 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.106 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81388503 0.18611501] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 751/1297 | B: 661/1387 | C: 635/1413 [LOSS Ex1] A: 0.62777 | B: 0.61091 | C: 0.60057 [LOGITS Ex2 A] Mean Abs: 2.190 | Max: 7.238 [LOSS Ex2] A: 0.09986 | B: 0.31861 | C: 0.23859 ** [JOINT LOSS] ** : 0.832104 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003758 | Grad Max: 0.113221 -> Layer: shared_layers.0.bias | Grad Mean: 0.196023 | Grad Max: 1.164450 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006358 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004428 | Grad Max: 0.004428 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001134 | Grad Max: 0.378015 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020246 | Grad Max: 2.113435 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000092 | Grad Max: 0.004500 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008086 | Grad Max: 0.065995 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000204 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001527 | Grad Max: 0.004676 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000334 | Grad 
Max: 0.001639 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000275 | Grad Max: 0.000859 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003065 | Grad Max: 0.003065 [GRADIENT NORM TOTAL] 4.5944 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.228 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50052 0.49948004] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.084 [MASKS] A(Pass/Fail): 723/1325 | B: 648/1400 | C: 607/1441 [LOSS Ex1] A: 0.63495 | B: 0.61128 | C: 0.61073 [LOGITS Ex2 A] Mean Abs: 2.237 | Max: 6.119 [LOSS Ex2] A: 0.09944 | B: 0.31433 | C: 0.22222 ** [JOINT LOSS] ** : 0.830988 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002540 | Grad Max: 0.094411 -> Layer: shared_layers.0.bias | Grad Mean: 0.196681 | Grad Max: 1.089365 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.005347 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006789 | Grad Max: 0.006789 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001192 | Grad Max: 0.498563 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020954 | Grad Max: 2.734081 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003104 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006783 | Grad Max: 0.044729 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000182 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001473 | Grad Max: 0.004311 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000107 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000430 | Grad Max: 0.001520 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000356 | Grad Max: 0.001617 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007778 | Grad Max: 0.007778 [GRADIENT NORM TOTAL] 5.6796 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.866 [SAMPLE 0 PREDICTION A] Top2 
Probs: [0.74126816 0.2587318 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.086 [MASKS] A(Pass/Fail): 713/1335 | B: 658/1390 | C: 590/1458 [LOSS Ex1] A: 0.63005 | B: 0.60676 | C: 0.60476 [LOGITS Ex2 A] Mean Abs: 2.207 | Max: 6.699 [LOSS Ex2] A: 0.11531 | B: 0.29406 | C: 0.20300 ** [JOINT LOSS] ** : 0.817983 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002566 | Grad Max: 0.061146 -> Layer: shared_layers.0.bias | Grad Mean: 0.137711 | Grad Max: 0.693674 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002187 | Grad Max: 0.005569 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005672 | Grad Max: 0.005672 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000976 | Grad Max: 0.493469 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017139 | Grad Max: 2.751652 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000043 | Grad Max: 0.002855 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002728 | Grad Max: 0.033795 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000128 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000455 | Grad Max: 0.002350 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000072 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000122 | Grad Max: 0.000798 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000446 | Grad Max: 0.001290 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000937 | Grad Max: 0.000937 [GRADIENT NORM TOTAL] 5.1120 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.003 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64144945 0.35855052] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 591/1025 | B: 614/1242 | C: 592/1456 [LOSS Ex1] A: 0.62816 | B: 0.61086 | C: 0.60626 [LOGITS Ex2 A] Mean Abs: 2.257 | Max: 10.279 [LOSS Ex2] A: 0.10516 | B: 0.29403 | C: 0.19966 ** [JOINT LOSS] ** : 0.814709 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight 
| Grad Mean: 0.004052 | Grad Max: 0.124344 -> Layer: shared_layers.0.bias | Grad Mean: 0.135013 | Grad Max: 0.528847 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005800 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002205 | Grad Max: 0.002205 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000987 | Grad Max: 0.401041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016750 | Grad Max: 2.247566 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000053 | Grad Max: 0.002520 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003148 | Grad Max: 0.024058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000187 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000726 | Grad Max: 0.002902 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000221 | Grad Max: 0.001287 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000453 | Grad Max: 0.001476 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003309 | Grad Max: 0.003309 [GRADIENT NORM TOTAL] 4.2714 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.231 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082675 0.4917325] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 724/1324 | B: 667/1381 | C: 585/1463 [LOSS Ex1] A: 0.62890 | B: 0.61073 | C: 0.60806 [LOGITS Ex2 A] Mean Abs: 2.253 | Max: 8.674 [LOSS Ex2] A: 0.08996 | B: 0.32067 | C: 0.19978 ** [JOINT LOSS] ** : 0.819369 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004181 | Grad Max: 0.161981 -> Layer: shared_layers.0.bias | Grad Mean: 0.097453 | Grad Max: 0.946957 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006205 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004879 | Grad Max: 0.004879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000845 | Grad Max: 0.170877 -> Layer: exit2_layers.0.bias | Grad Mean: 
0.013721 | Grad Max: 0.952561 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.002498 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003853 | Grad Max: 0.022960 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000225 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000944 | Grad Max: 0.003488 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000146 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000264 | Grad Max: 0.001495 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000429 | Grad Max: 0.001504 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003918 | Grad Max: 0.003918 [GRADIENT NORM TOTAL] 2.7431 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.160 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50583863 0.49416137] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 719/1329 | B: 649/1399 | C: 559/1489 [LOSS Ex1] A: 0.62557 | B: 0.61110 | C: 0.61389 [LOGITS Ex2 A] Mean Abs: 2.240 | Max: 6.121 [LOSS Ex2] A: 0.10323 | B: 0.30822 | C: 0.21014 ** [JOINT LOSS] ** : 0.824044 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003396 | Grad Max: 0.099855 -> Layer: shared_layers.0.bias | Grad Mean: 0.153062 | Grad Max: 0.917433 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006200 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005882 | Grad Max: 0.005882 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001061 | Grad Max: 0.211329 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018994 | Grad Max: 1.178789 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.004206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010012 | Grad Max: 0.055708 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000241 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002065 | Grad Max: 0.005538 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | 
Grad Max: 0.000186 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000539 | Grad Max: 0.002125 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000373 | Grad Max: 0.001813 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007985 | Grad Max: 0.007985 [GRADIENT NORM TOTAL] 3.4590 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.196 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5099835 0.4900165] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 722/1326 | B: 659/1389 | C: 601/1447 [LOSS Ex1] A: 0.62451 | B: 0.60656 | C: 0.60327 [LOGITS Ex2 A] Mean Abs: 2.247 | Max: 9.036 [LOSS Ex2] A: 0.10867 | B: 0.29261 | C: 0.22687 ** [JOINT LOSS] ** : 0.820829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003568 | Grad Max: 0.114802 -> Layer: shared_layers.0.bias | Grad Mean: 0.240379 | Grad Max: 1.571618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002272 | Grad Max: 0.005788 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002887 | Grad Max: 0.002887 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001576 | Grad Max: 0.354677 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028293 | Grad Max: 1.982871 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000104 | Grad Max: 0.006943 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009709 | Grad Max: 0.087265 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000195 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001715 | Grad Max: 0.005368 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000118 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000403 | Grad Max: 0.001337 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001025 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004523 | Grad Max: 0.004523 [GRADIENT NORM TOTAL] 6.0692 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | 
Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.991 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500871 0.499129] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.562 | Std: 0.085 [MASKS] A(Pass/Fail): 690/1358 | B: 617/1239 | C: 617/1431 [LOSS Ex1] A: 0.63345 | B: 0.61064 | C: 0.60666 [LOGITS Ex2 A] Mean Abs: 2.208 | Max: 5.731 [LOSS Ex2] A: 0.11000 | B: 0.29226 | C: 0.20546 ** [JOINT LOSS] ** : 0.819494 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006023 | Grad Max: 0.245768 -> Layer: shared_layers.0.bias | Grad Mean: 0.188552 | Grad Max: 0.862962 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.005354 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005945 | Grad Max: 0.005945 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001436 | Grad Max: 0.356433 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023326 | Grad Max: 1.967425 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000076 | Grad Max: 0.003210 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003635 | Grad Max: 0.044752 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000281 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000393 | Grad Max: 0.002702 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000086 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000129 | Grad Max: 0.000751 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001216 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001743 | Grad Max: 0.001743 [GRADIENT NORM TOTAL] 4.8925 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.886 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5424239 0.4575761] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.081 [MASKS] A(Pass/Fail): 692/1356 | B: 667/1381 | C: 641/1407 [LOSS Ex1] A: 0.63409 | B: 0.61053 | C: 0.60606 [LOGITS Ex2 A] Mean Abs: 2.207 | Max: 6.458 [LOSS Ex2] A: 0.10842 | B: 0.30814 | C: 0.22043 ** 
[JOINT LOSS] ** : 0.829224 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.124990 -> Layer: shared_layers.0.bias | Grad Mean: 0.164925 | Grad Max: 0.779219 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002065 | Grad Max: 0.005614 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009825 | Grad Max: 0.009825 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001083 | Grad Max: 0.460747 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018740 | Grad Max: 2.542868 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000067 | Grad Max: 0.003414 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005069 | Grad Max: 0.043743 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001203 | Grad Max: 0.004096 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000336 | Grad Max: 0.001224 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001475 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005598 | Grad Max: 0.005598 [GRADIENT NORM TOTAL] 4.6616 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.112 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81513786 0.18486212] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 751/1297 | B: 650/1398 | C: 461/915 [LOSS Ex1] A: 0.62751 | B: 0.61089 | C: 0.59567 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.345 [LOSS Ex2] A: 0.09802 | B: 0.31014 | C: 0.23002 ** [JOINT LOSS] ** : 0.824085 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003385 | Grad Max: 0.085529 -> Layer: shared_layers.0.bias | Grad Mean: 0.164700 | Grad Max: 0.604367 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002254 | Grad Max: 0.006159 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007307 | Grad Max: 0.007307 -> Layer: exit2_layers.0.weight | Grad 
Mean: 0.001182 | Grad Max: 0.388813 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020645 | Grad Max: 2.190099 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000082 | Grad Max: 0.003573 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006794 | Grad Max: 0.041048 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000244 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001414 | Grad Max: 0.005001 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000140 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000319 | Grad Max: 0.001673 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000270 | Grad Max: 0.001066 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002703 | Grad Max: 0.002703 [GRADIENT NORM TOTAL] 4.4975 [EPOCH SUMMARY] Train Loss: 0.8238 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8064 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8088 -> New: 0.8064) ############################## EPOCH 166/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.235 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50052404 0.49947596] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 660/1388 | C: 603/1445 [LOSS Ex1] A: 0.63470 | B: 0.60636 | C: 0.60262 [LOGITS Ex2 A] Mean Abs: 2.276 | Max: 6.837 [LOSS Ex2] A: 0.08856 | B: 0.29370 | C: 0.20193 ** [JOINT LOSS] ** : 0.809292 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003196 | Grad Max: 0.080145 -> Layer: shared_layers.0.bias | Grad Mean: 0.149166 | Grad Max: 0.726276 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005754 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002117 | Grad Max: 0.002117 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000920 | Grad Max: 0.400234 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015802 | Grad 
Max: 2.241073 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000051 | Grad Max: 0.003383 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003900 | Grad Max: 0.042421 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000695 | Grad Max: 0.003391 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000115 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000182 | Grad Max: 0.001434 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001377 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001949 | Grad Max: 0.001949 [GRADIENT NORM TOTAL] 4.1309 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 0.871 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7422363 0.25776365] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.086 [MASKS] A(Pass/Fail): 713/1335 | B: 617/1239 | C: 628/1420 [LOSS Ex1] A: 0.62978 | B: 0.61044 | C: 0.60329 [LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.506 [LOSS Ex2] A: 0.11356 | B: 0.28953 | C: 0.18257 ** [JOINT LOSS] ** : 0.809723 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003570 | Grad Max: 0.126265 -> Layer: shared_layers.0.bias | Grad Mean: 0.188555 | Grad Max: 0.960342 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.005738 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002307 | Grad Max: 0.002307 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001163 | Grad Max: 0.431720 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020023 | Grad Max: 2.395000 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000077 | Grad Max: 0.003648 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006411 | Grad Max: 0.052483 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000146 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000999 | Grad Max: 0.003688 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000093 
-> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.000993 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000408 | Grad Max: 0.001074 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001929 | Grad Max: 0.001929 [GRADIENT NORM TOTAL] 5.0766 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.009 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64186555 0.35813445] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 591/1025 | B: 667/1381 | C: 638/1410 [LOSS Ex1] A: 0.62788 | B: 0.61032 | C: 0.60847 [LOGITS Ex2 A] Mean Abs: 2.330 | Max: 10.246 [LOSS Ex2] A: 0.10159 | B: 0.31076 | C: 0.23924 ** [JOINT LOSS] ** : 0.832757 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004232 | Grad Max: 0.106788 -> Layer: shared_layers.0.bias | Grad Mean: 0.156348 | Grad Max: 1.288900 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005834 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007275 | Grad Max: 0.007275 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001334 | Grad Max: 0.465160 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023472 | Grad Max: 2.560962 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000103 | Grad Max: 0.003513 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009273 | Grad Max: 0.050557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000296 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001851 | Grad Max: 0.005884 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000393 | Grad Max: 0.002120 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000194 | Grad Max: 0.000780 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002974 | Grad Max: 0.002974 [GRADIENT NORM TOTAL] 5.2305 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS 
Ex1 A] Mean Abs: 0.149 | Max: 1.237 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082321 0.49176785] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 724/1324 | B: 650/1398 | C: 603/1445 [LOSS Ex1] A: 0.62862 | B: 0.61068 | C: 0.60568 [LOGITS Ex2 A] Mean Abs: 2.330 | Max: 8.299 [LOSS Ex2] A: 0.09437 | B: 0.30696 | C: 0.21557 ** [JOINT LOSS] ** : 0.820628 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003253 | Grad Max: 0.122114 -> Layer: shared_layers.0.bias | Grad Mean: 0.163416 | Grad Max: 0.596247 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005743 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002257 | Grad Max: 0.002257 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001319 | Grad Max: 0.425152 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022800 | Grad Max: 2.332852 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000079 | Grad Max: 0.003411 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006650 | Grad Max: 0.031648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001465 | Grad Max: 0.004780 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000141 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000322 | Grad Max: 0.001479 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001201 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002855 | Grad Max: 0.002855 [GRADIENT NORM TOTAL] 5.3140 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.166 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50573266 0.4942673 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 719/1329 | B: 661/1387 | C: 638/1410 [LOSS Ex1] A: 0.62528 | B: 0.60615 | C: 0.60623 [LOGITS Ex2 A] Mean Abs: 2.302 | Max: 5.833 [LOSS Ex2] A: 0.09138 | B: 0.29204 | C: 0.21262 ** [JOINT LOSS] 
** : 0.811231 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003466 | Grad Max: 0.104421 -> Layer: shared_layers.0.bias | Grad Mean: 0.159389 | Grad Max: 0.733474 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002261 | Grad Max: 0.006213 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006149 | Grad Max: 0.006149 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001271 | Grad Max: 0.145838 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022982 | Grad Max: 0.801062 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000122 | Grad Max: 0.005980 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011383 | Grad Max: 0.078338 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000245 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002268 | Grad Max: 0.005392 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000146 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000581 | Grad Max: 0.001971 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000362 | Grad Max: 0.001643 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008705 | Grad Max: 0.008705 [GRADIENT NORM TOTAL] 3.8827 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.510097 0.489903] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 722/1326 | B: 619/1237 | C: 597/1451 [LOSS Ex1] A: 0.62423 | B: 0.61022 | C: 0.60750 [LOGITS Ex2 A] Mean Abs: 2.288 | Max: 6.635 [LOSS Ex2] A: 0.10620 | B: 0.29568 | C: 0.21115 ** [JOINT LOSS] ** : 0.818322 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002518 | Grad Max: 0.062965 -> Layer: shared_layers.0.bias | Grad Mean: 0.140441 | Grad Max: 0.716237 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006329 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002549 | Grad Max: 0.002549 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000802 | 
Grad Max: 0.229499 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013924 | Grad Max: 1.292447 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.002839 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002838 | Grad Max: 0.031494 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000472 | Grad Max: 0.002989 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000089 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000110 | Grad Max: 0.000708 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000343 | Grad Max: 0.001028 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000111 | Grad Max: 0.000111 [GRADIENT NORM TOTAL] 3.4228 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.996 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50081146 0.4991885 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 690/1358 | B: 667/1381 | C: 611/1437 [LOSS Ex1] A: 0.63318 | B: 0.61011 | C: 0.59801 [LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.304 [LOSS Ex2] A: 0.09299 | B: 0.30658 | C: 0.21276 ** [JOINT LOSS] ** : 0.817877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003488 | Grad Max: 0.149578 -> Layer: shared_layers.0.bias | Grad Mean: 0.128225 | Grad Max: 0.474835 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002143 | Grad Max: 0.005877 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007688 | Grad Max: 0.007688 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000947 | Grad Max: 0.616392 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015572 | Grad Max: 3.401599 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.002607 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002239 | Grad Max: 0.022307 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000172 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000416 | Grad Max: 
0.002571 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000115 | Grad Max: 0.000699 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000384 | Grad Max: 0.001333 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002485 | Grad Max: 0.002485 [GRADIENT NORM TOTAL] 5.0592 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.137 | Max: 0.891 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5423557 0.4576443] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 692/1356 | B: 650/1398 | C: 571/1477 [LOSS Ex1] A: 0.63383 | B: 0.61046 | C: 0.60643 [LOGITS Ex2 A] Mean Abs: 2.258 | Max: 6.167 [LOSS Ex2] A: 0.10415 | B: 0.30384 | C: 0.20569 ** [JOINT LOSS] ** : 0.821467 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003826 | Grad Max: 0.117314 -> Layer: shared_layers.0.bias | Grad Mean: 0.207238 | Grad Max: 0.881619 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002051 | Grad Max: 0.005518 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007365 | Grad Max: 0.007365 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001556 | Grad Max: 0.543941 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027432 | Grad Max: 3.009562 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000095 | Grad Max: 0.003778 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008320 | Grad Max: 0.055575 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000010 | Grad Max: 0.000179 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001394 | Grad Max: 0.004420 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000094 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000348 | Grad Max: 0.001239 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000382 | Grad Max: 0.001167 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004152 | Grad Max: 0.004152 [GRADIENT NORM TOTAL] 6.0043 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.117 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8164772 0.1835228] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.086 [MASKS] A(Pass/Fail): 751/1297 | B: 661/1387 | C: 634/1414 [LOSS Ex1] A: 0.62723 | B: 0.60593 | C: 0.60501 [LOGITS Ex2 A] Mean Abs: 2.311 | Max: 6.671 [LOSS Ex2] A: 0.09455 | B: 0.28826 | C: 0.22662 ** [JOINT LOSS] ** : 0.815868 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002375 | Grad Max: 0.053702 -> Layer: shared_layers.0.bias | Grad Mean: 0.114948 | Grad Max: 0.775124 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005324 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003395 | Grad Max: 0.003395 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001059 | Grad Max: 0.336733 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019067 | Grad Max: 1.865715 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000084 | Grad Max: 0.003839 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007484 | Grad Max: 0.044482 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001376 | Grad Max: 0.005116 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000117 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000292 | Grad Max: 0.001440 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000320 | Grad Max: 0.001007 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003075 | Grad Max: 0.003075 [GRADIENT NORM TOTAL] 4.0564 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.241 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50051236 0.4994876 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 722/1326 | B: 619/1237 | C: 633/1415 [LOSS Ex1] A: 0.63444 | B: 0.60999 | C: 0.60644 [LOGITS Ex2 A] Mean Abs: 
2.289 | Max: 7.537 [LOSS Ex2] A: 0.09078 | B: 0.30284 | C: 0.21090 ** [JOINT LOSS] ** : 0.818463 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006093 | Grad Max: 0.184705 -> Layer: shared_layers.0.bias | Grad Mean: 0.557106 | Grad Max: 2.530241 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.004872 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002925 | Grad Max: 0.002925 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003391 | Grad Max: 0.560397 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062181 | Grad Max: 3.151563 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000361 | Grad Max: 0.011298 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034354 | Grad Max: 0.166618 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000568 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006762 | Grad Max: 0.015141 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000362 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001628 | Grad Max: 0.005054 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000861 | Grad Max: 0.002642 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023196 | Grad Max: 0.023196 [GRADIENT NORM TOTAL] 11.9349 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.876 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7432612 0.25673878] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 713/1335 | B: 667/1381 | C: 627/1421 [LOSS Ex1] A: 0.62951 | B: 0.60991 | C: 0.60489 [LOGITS Ex2 A] Mean Abs: 2.272 | Max: 7.594 [LOSS Ex2] A: 0.11568 | B: 0.31632 | C: 0.23296 ** [JOINT LOSS] ** : 0.836422 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003529 | Grad Max: 0.172563 -> Layer: shared_layers.0.bias | Grad Mean: 0.411223 | Grad Max: 2.212615 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002150 | Grad Max: 0.005968 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.004934 | Grad Max: 0.004934 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002498 | Grad Max: 0.582701 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045826 | Grad Max: 3.256716 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000237 | Grad Max: 0.009116 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022926 | Grad Max: 0.121818 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000364 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004362 | Grad Max: 0.009473 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000226 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001113 | Grad Max: 0.003412 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000562 | Grad Max: 0.001608 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016839 | Grad Max: 0.016839 [GRADIENT NORM TOTAL] 9.5616 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.015 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64240223 0.35759777] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 590/1026 | B: 650/1398 | C: 646/1402 [LOSS Ex1] A: 0.62761 | B: 0.61026 | C: 0.59876 [LOGITS Ex2 A] Mean Abs: 2.344 | Max: 11.826 [LOSS Ex2] A: 0.10439 | B: 0.31554 | C: 0.22951 ** [JOINT LOSS] ** : 0.828691 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004759 | Grad Max: 0.149250 -> Layer: shared_layers.0.bias | Grad Mean: 0.310833 | Grad Max: 1.239121 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005792 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007274 | Grad Max: 0.007274 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002422 | Grad Max: 0.604734 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043598 | Grad Max: 3.343949 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000213 | Grad Max: 0.006333 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020172 | Grad Max: 0.100328 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | 
Grad Max: 0.000343 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004094 | Grad Max: 0.009300 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000306 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000973 | Grad Max: 0.004350 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001494 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011621 | Grad Max: 0.011621 [GRADIENT NORM TOTAL] 8.1191 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.243 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50822777 0.4917723 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 724/1324 | B: 662/1386 | C: 590/1458 [LOSS Ex1] A: 0.62836 | B: 0.60574 | C: 0.60778 [LOGITS Ex2 A] Mean Abs: 2.327 | Max: 9.094 [LOSS Ex2] A: 0.09374 | B: 0.28338 | C: 0.19103 ** [JOINT LOSS] ** : 0.803344 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006148 | Grad Max: 0.210924 -> Layer: shared_layers.0.bias | Grad Mean: 0.409164 | Grad Max: 2.504673 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005705 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000266 | Grad Max: 0.000266 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002706 | Grad Max: 0.572337 -> Layer: exit2_layers.0.bias | Grad Mean: 0.048821 | Grad Max: 3.166521 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000241 | Grad Max: 0.007366 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022508 | Grad Max: 0.111783 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000395 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004784 | Grad Max: 0.010344 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000251 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001269 | Grad Max: 0.003461 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000607 | Grad Max: 0.002330 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018457 | Grad Max: 
0.018457 [GRADIENT NORM TOTAL] 9.5495 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.173 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056154 0.49438456] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 720/1328 | B: 619/1237 | C: 391/985 [LOSS Ex1] A: 0.62502 | B: 0.60980 | C: 0.60937 [LOGITS Ex2 A] Mean Abs: 2.304 | Max: 7.187 [LOSS Ex2] A: 0.09566 | B: 0.29434 | C: 0.20022 ** [JOINT LOSS] ** : 0.811468 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002610 | Grad Max: 0.079053 -> Layer: shared_layers.0.bias | Grad Mean: 0.151652 | Grad Max: 0.890185 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002208 | Grad Max: 0.006356 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006943 | Grad Max: 0.006943 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001281 | Grad Max: 0.179107 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022675 | Grad Max: 0.989885 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000120 | Grad Max: 0.004997 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010880 | Grad Max: 0.079939 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000217 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001849 | Grad Max: 0.005905 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000145 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000372 | Grad Max: 0.001565 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000407 | Grad Max: 0.001498 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004851 | Grad Max: 0.004851 [GRADIENT NORM TOTAL] 3.8489 [EPOCH SUMMARY] Train Loss: 0.8183 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8008 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.8064 -> New: 0.8008) ############################## EPOCH 167/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5101742 0.48982576] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 722/1326 | B: 667/1381 | C: 594/1454 [LOSS Ex1] A: 0.62397 | B: 0.60973 | C: 0.60640 [LOGITS Ex2 A] Mean Abs: 2.296 | Max: 8.152 [LOSS Ex2] A: 0.10755 | B: 0.31635 | C: 0.20890 ** [JOINT LOSS] ** : 0.824300 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.064545 -> Layer: shared_layers.0.bias | Grad Mean: 0.095673 | Grad Max: 0.401898 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.006096 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000449 | Grad Max: 0.000449 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000904 | Grad Max: 0.217278 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015605 | Grad Max: 1.205248 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.004254 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002811 | Grad Max: 0.038800 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000412 | Grad Max: 0.002903 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000103 | Grad Max: 0.000672 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.000895 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001367 | Grad Max: 0.001367 [GRADIENT NORM TOTAL] 3.3078 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.001 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007374 0.4992626] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 690/1358 | B: 650/1398 | C: 601/1447 [LOSS Ex1] A: 0.63293 | B: 0.61008 | C: 0.60971 [LOGITS Ex2 A] Mean Abs: 2.280 | Max: 7.258 [LOSS Ex2] A: 0.09835 | B: 0.30574 | C: 0.21551 ** [JOINT LOSS] ** : 0.824106 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004456 | Grad Max: 0.110807 -> Layer: shared_layers.0.bias | Grad Mean: 0.245933 | Grad Max: 1.224875 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001984 | Grad Max: 0.005503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006957 | Grad Max: 0.006957 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001861 | Grad Max: 0.252478 -> Layer: exit2_layers.0.bias | Grad Mean: 0.033079 | Grad Max: 1.381093 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000140 | Grad Max: 0.007691 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012862 | Grad Max: 0.111598 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000206 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002241 | Grad Max: 0.006577 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000143 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000370 | Grad Max: 0.001174 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005711 | Grad Max: 0.005711 [GRADIENT NORM TOTAL] 5.9135 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.895 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54233676 0.45766327] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.561 | Std: 0.082 [MASKS] A(Pass/Fail): 693/1355 | B: 662/1386 | C: 623/1425 [LOSS Ex1] A: 0.63360 | B: 0.60555 | C: 0.60607 [LOGITS Ex2 A] Mean Abs: 2.264 | Max: 6.194 [LOSS Ex2] A: 0.11220 | B: 0.27957 | C: 0.21912 ** [JOINT LOSS] ** : 0.818703 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005722 | Grad Max: 0.222514 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.141919 | Grad Max: 0.617660 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002113 | Grad Max: 0.005216 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005183 | Grad Max: 0.005183 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001162 | Grad Max: 0.385266 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018820 | Grad Max: 2.164001 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000070 | Grad Max: 0.003469 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004028 | Grad Max: 0.028385 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000228 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000933 | Grad Max: 0.003706 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000114 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000194 | Grad Max: 0.001026 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001444 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003220 | Grad Max: 0.003220 [GRADIENT NORM TOTAL] 4.2841 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.123 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.81762815 0.18237184] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.087 [MASKS] A(Pass/Fail): 751/1297 | B: 619/1237 | C: 611/1437 [LOSS Ex1] A: 0.62700 | B: 0.60961 | C: 0.60709 [LOGITS Ex2 A] Mean Abs: 2.294 | Max: 6.718 [LOSS Ex2] A: 0.09615 | B: 0.28633 | C: 0.21470 ** [JOINT LOSS] ** : 0.813623 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.082713 -> Layer: shared_layers.0.bias | Grad Mean: 0.224596 | Grad Max: 1.200093 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002136 | Grad Max: 0.005472 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002559 | Grad Max: 0.002559 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001321 | Grad Max: 0.630204 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023953 | Grad Max: 3.495461 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000098 | Grad Max: 0.004291 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009082 | Grad Max: 0.042622 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000197 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001760 | Grad Max: 0.005543 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000154 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000400 | Grad Max: 0.002061 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000344 | Grad Max: 0.001616 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005902 | Grad Max: 0.005902 [GRADIENT NORM TOTAL] 6.1667 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.247 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004655 0.49953443] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.085 [MASKS] A(Pass/Fail): 723/1325 | B: 667/1381 | C: 620/1428 [LOSS Ex1] A: 0.63422 | B: 0.60954 | C: 0.60397 [LOGITS Ex2 A] Mean Abs: 2.325 | Max: 6.814 [LOSS Ex2] A: 0.09521 | B: 0.31391 | C: 0.20669 ** [JOINT LOSS] ** : 0.821183 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003471 | Grad Max: 0.075288 -> Layer: shared_layers.0.bias | Grad Mean: 0.168750 | Grad Max: 0.760853 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.005459 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000786 | Grad Max: 0.000786 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001449 | Grad Max: 0.554126 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026768 | Grad Max: 3.083837 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.006376 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013195 | Grad Max: 0.086450 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000227 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002505 | Grad Max: 0.006619 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000191 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000601 | Grad 
Max: 0.002326 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001472 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007569 | Grad Max: 0.007569 [GRADIENT NORM TOTAL] 5.5283 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 0.881 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7440529 0.2559471] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.087 [MASKS] A(Pass/Fail): 714/1334 | B: 650/1398 | C: 694/1354 [LOSS Ex1] A: 0.62928 | B: 0.60989 | C: 0.59313 [LOGITS Ex2 A] Mean Abs: 2.305 | Max: 7.369 [LOSS Ex2] A: 0.10916 | B: 0.30541 | C: 0.19016 ** [JOINT LOSS] ** : 0.812342 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002654 | Grad Max: 0.077930 -> Layer: shared_layers.0.bias | Grad Mean: 0.186673 | Grad Max: 0.955453 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005791 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005584 | Grad Max: 0.005584 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001035 | Grad Max: 0.702375 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018109 | Grad Max: 3.898429 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.003460 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004117 | Grad Max: 0.032318 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000650 | Grad Max: 0.003290 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000100 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000138 | Grad Max: 0.000778 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001221 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001583 | Grad Max: 0.001583 [GRADIENT NORM TOTAL] 5.8278 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.021 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.6428182 0.35718176] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 590/1026 | B: 662/1386 | C: 606/1442 [LOSS Ex1] A: 0.62739 | B: 0.60536 | C: 0.60702 [LOGITS Ex2 A] Mean Abs: 2.368 | Max: 8.579 [LOSS Ex2] A: 0.09596 | B: 0.27734 | C: 0.23604 ** [JOINT LOSS] ** : 0.816374 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003089 | Grad Max: 0.065351 -> Layer: shared_layers.0.bias | Grad Mean: 0.093690 | Grad Max: 0.385697 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005956 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004130 | Grad Max: 0.004130 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000839 | Grad Max: 0.136934 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014187 | Grad Max: 0.763568 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003350 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004356 | Grad Max: 0.043493 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000140 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000691 | Grad Max: 0.003536 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000078 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000183 | Grad Max: 0.000822 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000266 | Grad Max: 0.001125 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003261 | Grad Max: 0.003261 [GRADIENT NORM TOTAL] 2.5093 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.250 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081689 0.49183115] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 724/1324 | B: 619/1237 | C: 651/1397 [LOSS Ex1] A: 0.62813 | B: 0.60941 | C: 0.59917 [LOGITS Ex2 A] Mean Abs: 2.366 | Max: 10.593 [LOSS Ex2] A: 0.09165 | B: 0.29743 | C: 0.20837 ** [JOINT LOSS] ** : 0.811388 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.003958 | Grad Max: 0.126458 -> Layer: shared_layers.0.bias | Grad Mean: 0.252780 | Grad Max: 1.105037 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.005556 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000225 | Grad Max: 0.000225 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.334367 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031221 | Grad Max: 1.846927 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000146 | Grad Max: 0.005960 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013307 | Grad Max: 0.082038 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000276 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002627 | Grad Max: 0.006256 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000656 | Grad Max: 0.002135 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000357 | Grad Max: 0.001681 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008833 | Grad Max: 0.008833 [GRADIENT NORM TOTAL] 6.1480 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.178 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50562143 0.49437857] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 720/1328 | B: 667/1381 | C: 633/1415 [LOSS Ex1] A: 0.62479 | B: 0.60935 | C: 0.60307 [LOGITS Ex2 A] Mean Abs: 2.335 | Max: 6.259 [LOSS Ex2] A: 0.09841 | B: 0.31000 | C: 0.21636 ** [JOINT LOSS] ** : 0.820662 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002191 | Grad Max: 0.086094 -> Layer: shared_layers.0.bias | Grad Mean: 0.143873 | Grad Max: 0.906196 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005967 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000239 | Grad Max: 0.000239 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001076 | Grad Max: 0.447286 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.019194 | Grad Max: 2.455468 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.003565 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003618 | Grad Max: 0.040545 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000149 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000686 | Grad Max: 0.003622 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000113 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000184 | Grad Max: 0.001235 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000249 | Grad Max: 0.001147 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003151 | Grad Max: 0.003151 [GRADIENT NORM TOTAL] 4.7771 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.214 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51026076 0.48973924] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 722/1326 | B: 650/1398 | C: 581/1467 [LOSS Ex1] A: 0.62374 | B: 0.60969 | C: 0.60801 [LOGITS Ex2 A] Mean Abs: 2.308 | Max: 7.550 [LOSS Ex2] A: 0.11212 | B: 0.30703 | C: 0.19579 ** [JOINT LOSS] ** : 0.818793 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002389 | Grad Max: 0.049043 -> Layer: shared_layers.0.bias | Grad Mean: 0.110867 | Grad Max: 0.547526 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002162 | Grad Max: 0.006123 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001624 | Grad Max: 0.001624 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000945 | Grad Max: 0.344475 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016783 | Grad Max: 1.895428 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002647 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002186 | Grad Max: 0.026465 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000157 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000400 | Grad Max: 0.002413 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000100 | Grad Max: 0.000627 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000324 | Grad Max: 0.001060 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001874 | Grad Max: 0.001874 [GRADIENT NORM TOTAL] 3.9053 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 1.006 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50065976 0.49934024] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 690/1358 | B: 663/1385 | C: 601/1447 [LOSS Ex1] A: 0.63271 | B: 0.60516 | C: 0.61105 [LOGITS Ex2 A] Mean Abs: 2.293 | Max: 6.597 [LOSS Ex2] A: 0.09957 | B: 0.28872 | C: 0.22077 ** [JOINT LOSS] ** : 0.819326 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003585 | Grad Max: 0.133432 -> Layer: shared_layers.0.bias | Grad Mean: 0.182276 | Grad Max: 0.858926 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.005091 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005192 | Grad Max: 0.005192 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001480 | Grad Max: 0.453397 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026759 | Grad Max: 2.531817 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000143 | Grad Max: 0.006456 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013265 | Grad Max: 0.073197 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000311 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002675 | Grad Max: 0.006453 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000168 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000660 | Grad Max: 0.001964 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000391 | Grad Max: 0.001632 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010340 | Grad Max: 0.010340 [GRADIENT NORM TOTAL] 5.0917 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.138 | Max: 0.900 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5423908 0.45760918] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 693/1355 | B: 619/1237 | C: 643/1405 [LOSS Ex1] A: 0.63339 | B: 0.60920 | C: 0.59924 [LOGITS Ex2 A] Mean Abs: 2.342 | Max: 6.787 [LOSS Ex2] A: 0.09804 | B: 0.29184 | C: 0.22302 ** [JOINT LOSS] ** : 0.818240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002976 | Grad Max: 0.150096 -> Layer: shared_layers.0.bias | Grad Mean: 0.342726 | Grad Max: 1.551452 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.005849 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009985 | Grad Max: 0.009985 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.641229 -> Layer: exit2_layers.0.bias | Grad Mean: 0.039529 | Grad Max: 3.560706 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000189 | Grad Max: 0.008927 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018609 | Grad Max: 0.108694 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000301 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003461 | Grad Max: 0.007809 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000843 | Grad Max: 0.003348 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000346 | Grad Max: 0.001772 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010637 | Grad Max: 0.010637 [GRADIENT NORM TOTAL] 8.4603 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.129 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8188314 0.18116868] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.087 [MASKS] A(Pass/Fail): 751/1297 | B: 667/1381 | C: 637/1411 [LOSS Ex1] A: 0.62676 | B: 0.60916 | C: 0.60256 [LOGITS Ex2 A] Mean Abs: 2.339 | Max: 7.543 
[LOSS Ex2] A: 0.09541 | B: 0.30048 | C: 0.22518 ** [JOINT LOSS] ** : 0.819850 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004473 | Grad Max: 0.137099 -> Layer: shared_layers.0.bias | Grad Mean: 0.304428 | Grad Max: 1.590001 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005974 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003925 | Grad Max: 0.003925 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001996 | Grad Max: 0.284864 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036865 | Grad Max: 1.585055 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000233 | Grad Max: 0.007143 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022327 | Grad Max: 0.107942 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000360 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004215 | Grad Max: 0.009684 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000256 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000990 | Grad Max: 0.003252 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000457 | Grad Max: 0.001827 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013148 | Grad Max: 0.013148 [GRADIENT NORM TOTAL] 6.4284 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.255 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004184 0.49958155] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 724/1324 | B: 650/1398 | C: 420/956 [LOSS Ex1] A: 0.63400 | B: 0.60949 | C: 0.60297 [LOGITS Ex2 A] Mean Abs: 2.347 | Max: 7.354 [LOSS Ex2] A: 0.09163 | B: 0.30899 | C: 0.22874 ** [JOINT LOSS] ** : 0.825280 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006797 | Grad Max: 0.217317 -> Layer: shared_layers.0.bias | Grad Mean: 0.571175 | Grad Max: 2.851531 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005451 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005886 | Grad Max: 
0.005886 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003754 | Grad Max: 0.886042 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069341 | Grad Max: 4.910470 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.013805 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035112 | Grad Max: 0.183642 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000524 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006883 | Grad Max: 0.014041 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000348 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001675 | Grad Max: 0.005251 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000891 | Grad Max: 0.002447 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024805 | Grad Max: 0.024805 [GRADIENT NORM TOTAL] 13.3506 [EPOCH SUMMARY] Train Loss: 0.8189 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8066 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 168/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.886 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74505603 0.25494394] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.087 [MASKS] A(Pass/Fail): 714/1334 | B: 663/1385 | C: 661/1387 [LOSS Ex1] A: 0.62905 | B: 0.60497 | C: 0.59624 [LOGITS Ex2 A] Mean Abs: 2.284 | Max: 7.355 [LOSS Ex2] A: 0.11288 | B: 0.29862 | C: 0.20390 ** [JOINT LOSS] ** : 0.815218 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005851 | Grad Max: 0.208375 -> Layer: shared_layers.0.bias | Grad Mean: 0.622774 | Grad Max: 2.841291 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002185 | Grad Max: 0.005804 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004282 | Grad Max: 0.004282 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003717 | Grad Max: 0.808064 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.069611 | Grad Max: 4.480676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.013668 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037405 | Grad Max: 0.205699 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000562 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007090 | Grad Max: 0.016260 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000365 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001679 | Grad Max: 0.005166 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000825 | Grad Max: 0.002489 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023594 | Grad Max: 0.023594 [GRADIENT NORM TOTAL] 13.3534 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.027 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64344144 0.3565585 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 590/1026 | B: 619/1237 | C: 626/1422 [LOSS Ex1] A: 0.62716 | B: 0.60901 | C: 0.60712 [LOGITS Ex2 A] Mean Abs: 2.383 | Max: 9.593 [LOSS Ex2] A: 0.09469 | B: 0.28779 | C: 0.21080 ** [JOINT LOSS] ** : 0.812190 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003895 | Grad Max: 0.086758 -> Layer: shared_layers.0.bias | Grad Mean: 0.249495 | Grad Max: 1.045749 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.005714 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007930 | Grad Max: 0.007930 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001620 | Grad Max: 0.647319 -> Layer: exit2_layers.0.bias | Grad Mean: 0.029691 | Grad Max: 3.587425 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000162 | Grad Max: 0.006319 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015111 | Grad Max: 0.090946 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000307 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002927 | Grad Max: 0.008047 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000008 | Grad Max: 0.000171 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000697 | Grad Max: 0.002229 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001639 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010630 | Grad Max: 0.010630 [GRADIENT NORM TOTAL] 6.5535 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.257 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5081495 0.4918505] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 724/1324 | B: 667/1381 | C: 624/1424 [LOSS Ex1] A: 0.62790 | B: 0.60899 | C: 0.60305 [LOGITS Ex2 A] Mean Abs: 2.394 | Max: 9.426 [LOSS Ex2] A: 0.09029 | B: 0.32260 | C: 0.22754 ** [JOINT LOSS] ** : 0.826788 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010400 | Grad Max: 0.312958 -> Layer: shared_layers.0.bias | Grad Mean: 0.909919 | Grad Max: 4.137902 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002195 | Grad Max: 0.005422 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004835 | Grad Max: 0.004835 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006121 | Grad Max: 0.841210 -> Layer: exit2_layers.0.bias | Grad Mean: 0.113418 | Grad Max: 4.678674 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000641 | Grad Max: 0.019609 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061906 | Grad Max: 0.326748 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000869 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012105 | Grad Max: 0.023650 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000624 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002912 | Grad Max: 0.009282 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001404 | Grad Max: 0.003630 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040670 | Grad Max: 0.040670 [GRADIENT NORM TOTAL] 19.8104 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5056449 0.49435502] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.087 [MASKS] A(Pass/Fail): 720/1328 | B: 650/1398 | C: 620/1428 [LOSS Ex1] A: 0.62455 | B: 0.60933 | C: 0.60521 [LOGITS Ex2 A] Mean Abs: 2.412 | Max: 6.095 [LOSS Ex2] A: 0.10952 | B: 0.33346 | C: 0.23266 ** [JOINT LOSS] ** : 0.838244 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011680 | Grad Max: 0.443724 -> Layer: shared_layers.0.bias | Grad Mean: 1.144838 | Grad Max: 5.899863 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.005639 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002615 | Grad Max: 0.002615 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007602 | Grad Max: 1.180987 -> Layer: exit2_layers.0.bias | Grad Mean: 0.141186 | Grad Max: 6.609797 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000764 | Grad Max: 0.024371 -> Layer: exit2_layers.3.bias | Grad Mean: 0.074551 | Grad Max: 0.384120 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001044 -> Layer: exit2_layers.6.bias | Grad Mean: 0.014483 | Grad Max: 0.028879 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000757 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003496 | Grad Max: 0.010978 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001645 | Grad Max: 0.003695 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048780 | Grad Max: 0.048780 [GRADIENT NORM TOTAL] 25.8887 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.219 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51034486 0.48965514] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 723/1325 | B: 663/1385 | C: 623/1425 [LOSS Ex1] A: 0.62350 | B: 0.60481 | C: 0.60710 [LOGITS Ex2 A] Mean Abs: 2.358 | Max: 7.094 [LOSS Ex2] A: 0.11754 | B: 0.28419 | 
C: 0.23630 ** [JOINT LOSS] ** : 0.824480 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010667 | Grad Max: 0.284262 -> Layer: shared_layers.0.bias | Grad Mean: 0.683029 | Grad Max: 3.059067 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.005626 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002747 | Grad Max: 0.002747 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004705 | Grad Max: 0.584230 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086043 | Grad Max: 3.294141 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.015212 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047902 | Grad Max: 0.226146 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000746 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009545 | Grad Max: 0.019975 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000487 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002310 | Grad Max: 0.007365 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001084 | Grad Max: 0.002713 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031268 | Grad Max: 0.031268 [GRADIENT NORM TOTAL] 15.0644 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.010 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50061744 0.49938253] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.086 [MASKS] A(Pass/Fail): 690/1358 | B: 619/1237 | C: 635/1413 [LOSS Ex1] A: 0.63249 | B: 0.60886 | C: 0.60014 [LOGITS Ex2 A] Mean Abs: 2.264 | Max: 6.128 [LOSS Ex2] A: 0.09362 | B: 0.29582 | C: 0.19464 ** [JOINT LOSS] ** : 0.808529 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003650 | Grad Max: 0.170498 -> Layer: shared_layers.0.bias | Grad Mean: 0.499427 | Grad Max: 2.437197 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.005724 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003556 | Grad Max: 0.003556 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.003387 | Grad Max: 0.639325 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062937 | Grad Max: 3.578642 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.011658 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035387 | Grad Max: 0.199403 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006668 | Grad Max: 0.014120 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000381 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001571 | Grad Max: 0.005670 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000740 | Grad Max: 0.002467 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021724 | Grad Max: 0.021724 [GRADIENT NORM TOTAL] 11.8035 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.904 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.542426 0.45757404] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.082 [MASKS] A(Pass/Fail): 693/1355 | B: 667/1381 | C: 590/1458 [LOSS Ex1] A: 0.63319 | B: 0.60886 | C: 0.60700 [LOGITS Ex2 A] Mean Abs: 2.230 | Max: 6.861 [LOSS Ex2] A: 0.10624 | B: 0.33184 | C: 0.21821 ** [JOINT LOSS] ** : 0.835114 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009472 | Grad Max: 0.300976 -> Layer: shared_layers.0.bias | Grad Mean: 0.868006 | Grad Max: 4.091258 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005910 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011968 | Grad Max: 0.011968 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005751 | Grad Max: 0.834226 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107164 | Grad Max: 4.706403 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000606 | Grad Max: 0.020790 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059512 | Grad Max: 0.329571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000801 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.011487 | Grad Max: 0.022933 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000573 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002766 | Grad Max: 0.008339 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001314 | Grad Max: 0.003140 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039173 | Grad Max: 0.039173 [GRADIENT NORM TOTAL] 19.5581 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.134 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.819793 0.18020703] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.087 [MASKS] A(Pass/Fail): 751/1297 | B: 650/1398 | C: 654/1394 [LOSS Ex1] A: 0.62656 | B: 0.60920 | C: 0.59817 [LOGITS Ex2 A] Mean Abs: 2.289 | Max: 8.277 [LOSS Ex2] A: 0.08837 | B: 0.31896 | C: 0.20644 ** [JOINT LOSS] ** : 0.815902 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004179 | Grad Max: 0.168744 -> Layer: shared_layers.0.bias | Grad Mean: 0.466764 | Grad Max: 2.330323 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005863 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003579 | Grad Max: 0.003579 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003140 | Grad Max: 0.578967 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057940 | Grad Max: 3.229973 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.009933 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031829 | Grad Max: 0.161595 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000459 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006280 | Grad Max: 0.013336 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000349 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001524 | Grad Max: 0.004956 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000741 | Grad Max: 0.002387 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022034 | Grad Max: 0.022034 [GRADIENT NORM TOTAL] 
10.7028 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.260 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004349 0.49956506] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 724/1324 | B: 663/1385 | C: 643/1405 [LOSS Ex1] A: 0.63382 | B: 0.60469 | C: 0.60556 [LOGITS Ex2 A] Mean Abs: 2.368 | Max: 6.481 [LOSS Ex2] A: 0.09469 | B: 0.28366 | C: 0.21516 ** [JOINT LOSS] ** : 0.812524 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004523 | Grad Max: 0.207978 -> Layer: shared_layers.0.bias | Grad Mean: 0.486040 | Grad Max: 2.702191 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005315 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000234 | Grad Max: 0.000234 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003086 | Grad Max: 0.685378 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057051 | Grad Max: 3.822773 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.009758 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029524 | Grad Max: 0.160769 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000389 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005454 | Grad Max: 0.011505 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000276 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001336 | Grad Max: 0.004004 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000561 | Grad Max: 0.001956 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017831 | Grad Max: 0.017831 [GRADIENT NORM TOTAL] 11.0947 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.889 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7456844 0.25431558] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.087 [MASKS] A(Pass/Fail): 714/1334 | B: 619/1237 | C: 653/1395 [LOSS Ex1] A: 
0.62886 | B: 0.60874 | C: 0.59874 [LOGITS Ex2 A] Mean Abs: 2.352 | Max: 7.731 [LOSS Ex2] A: 0.10915 | B: 0.30079 | C: 0.23099 ** [JOINT LOSS] ** : 0.825759 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005636 | Grad Max: 0.242662 -> Layer: shared_layers.0.bias | Grad Mean: 0.661550 | Grad Max: 3.143151 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002135 | Grad Max: 0.005601 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007476 | Grad Max: 0.007476 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004252 | Grad Max: 0.916355 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079221 | Grad Max: 5.100037 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000449 | Grad Max: 0.014981 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044275 | Grad Max: 0.228076 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000570 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008459 | Grad Max: 0.017375 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000451 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002059 | Grad Max: 0.007052 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000924 | Grad Max: 0.002464 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028323 | Grad Max: 0.028323 [GRADIENT NORM TOTAL] 14.8431 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.031 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6437635 0.35623655] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.087 [MASKS] A(Pass/Fail): 590/1026 | B: 667/1381 | C: 610/1438 [LOSS Ex1] A: 0.62698 | B: 0.60875 | C: 0.60593 [LOGITS Ex2 A] Mean Abs: 2.372 | Max: 9.242 [LOSS Ex2] A: 0.09110 | B: 0.29982 | C: 0.20724 ** [JOINT LOSS] ** : 0.813275 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003962 | Grad Max: 0.081903 -> Layer: shared_layers.0.bias | Grad Mean: 0.248595 | Grad Max: 0.939856 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | 
Grad Max: 0.005515 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003663 | Grad Max: 0.003663 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001748 | Grad Max: 0.472832 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032105 | Grad Max: 2.614952 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.007469 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017933 | Grad Max: 0.108190 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000317 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003436 | Grad Max: 0.008613 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000190 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000846 | Grad Max: 0.002540 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000413 | Grad Max: 0.001711 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012072 | Grad Max: 0.012072 [GRADIENT NORM TOTAL] 5.7169 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.261 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50816494 0.49183503] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 724/1324 | B: 650/1398 | C: 664/1384 [LOSS Ex1] A: 0.62773 | B: 0.60909 | C: 0.60271 [LOGITS Ex2 A] Mean Abs: 2.288 | Max: 9.400 [LOSS Ex2] A: 0.09338 | B: 0.31226 | C: 0.20576 ** [JOINT LOSS] ** : 0.816980 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008207 | Grad Max: 0.271078 -> Layer: shared_layers.0.bias | Grad Mean: 0.745104 | Grad Max: 3.581179 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002097 | Grad Max: 0.005660 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002665 | Grad Max: 0.002665 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004624 | Grad Max: 0.638362 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086013 | Grad Max: 3.525337 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000489 | Grad Max: 0.016872 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048066 | Grad Max: 
0.281649 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000621 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009275 | Grad Max: 0.018398 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000484 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002241 | Grad Max: 0.007361 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001060 | Grad Max: 0.003042 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031536 | Grad Max: 0.031536 [GRADIENT NORM TOTAL] 16.0311 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.187 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50558144 0.49441853] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 720/1328 | B: 663/1385 | C: 610/1438 [LOSS Ex1] A: 0.62438 | B: 0.60458 | C: 0.60602 [LOGITS Ex2 A] Mean Abs: 2.251 | Max: 5.944 [LOSS Ex2] A: 0.10959 | B: 0.30556 | C: 0.20347 ** [JOINT LOSS] ** : 0.817869 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008310 | Grad Max: 0.312760 -> Layer: shared_layers.0.bias | Grad Mean: 0.915854 | Grad Max: 4.217233 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002184 | Grad Max: 0.005986 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000290 | Grad Max: 0.000290 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005703 | Grad Max: 0.719859 -> Layer: exit2_layers.0.bias | Grad Mean: 0.106510 | Grad Max: 3.991581 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000611 | Grad Max: 0.018721 -> Layer: exit2_layers.3.bias | Grad Mean: 0.060389 | Grad Max: 0.313958 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000828 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011608 | Grad Max: 0.023188 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000621 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002765 | Grad Max: 0.009101 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001281 | Grad Max: 0.003213 
-> Layer: exit2_layers.12.bias | Grad Mean: 0.038033 | Grad Max: 0.038033 [GRADIENT NORM TOTAL] 19.5138 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.223 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51042783 0.48957214] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 724/1324 | B: 619/1237 | C: 412/964 [LOSS Ex1] A: 0.62334 | B: 0.60863 | C: 0.61089 [LOGITS Ex2 A] Mean Abs: 2.247 | Max: 6.802 [LOSS Ex2] A: 0.11140 | B: 0.29676 | C: 0.22656 ** [JOINT LOSS] ** : 0.825863 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004724 | Grad Max: 0.200558 -> Layer: shared_layers.0.bias | Grad Mean: 0.509727 | Grad Max: 2.818214 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002149 | Grad Max: 0.006264 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003332 | Grad Max: 0.003332 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003042 | Grad Max: 0.470019 -> Layer: exit2_layers.0.bias | Grad Mean: 0.056255 | Grad Max: 2.597116 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.010626 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029964 | Grad Max: 0.174075 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000466 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005530 | Grad Max: 0.013126 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000329 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001356 | Grad Max: 0.004428 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000661 | Grad Max: 0.002134 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019738 | Grad Max: 0.019738 [GRADIENT NORM TOTAL] 10.9879 [EPOCH SUMMARY] Train Loss: 0.8206 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8016 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 169/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.013 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500606 0.49939394] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.086 [MASKS] A(Pass/Fail): 690/1358 | B: 667/1381 | C: 633/1415 [LOSS Ex1] A: 0.63234 | B: 0.60865 | C: 0.60774 [LOGITS Ex2 A] Mean Abs: 2.285 | Max: 7.788 [LOSS Ex2] A: 0.09681 | B: 0.30810 | C: 0.23356 ** [JOINT LOSS] ** : 0.829062 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006319 | Grad Max: 0.154288 -> Layer: shared_layers.0.bias | Grad Mean: 0.469085 | Grad Max: 2.047938 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.005496 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007104 | Grad Max: 0.007104 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003096 | Grad Max: 0.417115 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057015 | Grad Max: 2.329649 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000321 | Grad Max: 0.009252 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031076 | Grad Max: 0.153647 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000491 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006084 | Grad Max: 0.013210 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000337 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001433 | Grad Max: 0.004704 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000635 | Grad Max: 0.001934 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018797 | Grad Max: 0.018797 [GRADIENT NORM TOTAL] 9.9886 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.907 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54231215 0.45768788] | Indices: 
[0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 693/1355 | B: 650/1398 | C: 620/1428 [LOSS Ex1] A: 0.63305 | B: 0.60900 | C: 0.60279 [LOGITS Ex2 A] Mean Abs: 2.280 | Max: 6.184 [LOSS Ex2] A: 0.10984 | B: 0.31058 | C: 0.20781 ** [JOINT LOSS] ** : 0.824354 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007775 | Grad Max: 0.308092 -> Layer: shared_layers.0.bias | Grad Mean: 0.818178 | Grad Max: 4.140939 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008216 | Grad Max: 0.008216 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005169 | Grad Max: 0.794147 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095910 | Grad Max: 4.435971 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000524 | Grad Max: 0.017047 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051980 | Grad Max: 0.264520 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000735 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010091 | Grad Max: 0.021459 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000490 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002431 | Grad Max: 0.007647 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001081 | Grad Max: 0.003065 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033203 | Grad Max: 0.033203 [GRADIENT NORM TOTAL] 17.9708 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.137 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.82045096 0.17954908] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.087 [MASKS] A(Pass/Fail): 751/1297 | B: 663/1385 | C: 649/1399 [LOSS Ex1] A: 0.62642 | B: 0.60449 | C: 0.60412 [LOGITS Ex2 A] Mean Abs: 2.281 | Max: 9.082 [LOSS Ex2] A: 0.09225 | B: 0.27822 | C: 0.20209 ** [JOINT LOSS] ** : 0.802530 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005375 | Grad Max: 
0.161442 -> Layer: shared_layers.0.bias | Grad Mean: 0.383604 | Grad Max: 1.780985 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002206 | Grad Max: 0.005932 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006105 | Grad Max: 0.006105 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002535 | Grad Max: 0.363011 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045837 | Grad Max: 2.001706 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.008834 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024259 | Grad Max: 0.136094 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000395 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004898 | Grad Max: 0.010537 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000267 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001239 | Grad Max: 0.003955 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000558 | Grad Max: 0.002203 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017120 | Grad Max: 0.017120 [GRADIENT NORM TOTAL] 8.3517 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.263 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004848 0.49951515] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 725/1323 | B: 620/1236 | C: 669/1379 [LOSS Ex1] A: 0.63369 | B: 0.60853 | C: 0.59585 [LOGITS Ex2 A] Mean Abs: 2.252 | Max: 7.508 [LOSS Ex2] A: 0.09017 | B: 0.30575 | C: 0.19775 ** [JOINT LOSS] ** : 0.810581 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008374 | Grad Max: 0.206446 -> Layer: shared_layers.0.bias | Grad Mean: 0.671774 | Grad Max: 3.003193 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005675 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000186 | Grad Max: 0.000186 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004496 | Grad Max: 0.807122 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083886 | Grad Max: 4.490779 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.014930 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047003 | Grad Max: 0.244991 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000696 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009071 | Grad Max: 0.018650 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000510 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002132 | Grad Max: 0.007321 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001026 | Grad Max: 0.003031 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029172 | Grad Max: 0.029172 [GRADIENT NORM TOTAL] 14.7660 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.892 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7461759 0.25382408] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.087 [MASKS] A(Pass/Fail): 713/1335 | B: 667/1381 | C: 612/1436 [LOSS Ex1] A: 0.62872 | B: 0.60856 | C: 0.60853 [LOGITS Ex2 A] Mean Abs: 2.192 | Max: 7.045 [LOSS Ex2] A: 0.11413 | B: 0.33829 | C: 0.23157 ** [JOINT LOSS] ** : 0.843266 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011787 | Grad Max: 0.324565 -> Layer: shared_layers.0.bias | Grad Mean: 0.922320 | Grad Max: 4.321826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.005464 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009236 | Grad Max: 0.009236 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006294 | Grad Max: 1.018728 -> Layer: exit2_layers.0.bias | Grad Mean: 0.117124 | Grad Max: 5.636506 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000653 | Grad Max: 0.022415 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063799 | Grad Max: 0.336156 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000871 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012513 | Grad Max: 0.024739 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000640 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.003078 | Grad Max: 0.009554 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001536 | Grad Max: 0.003565 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044486 | Grad Max: 0.044486 [GRADIENT NORM TOTAL] 20.7903 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.034 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6439525 0.3560475] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 590/1026 | B: 650/1398 | C: 632/1416 [LOSS Ex1] A: 0.62684 | B: 0.60891 | C: 0.60514 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 7.236 [LOSS Ex2] A: 0.09945 | B: 0.30863 | C: 0.23930 ** [JOINT LOSS] ** : 0.829423 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007403 | Grad Max: 0.177557 -> Layer: shared_layers.0.bias | Grad Mean: 0.546236 | Grad Max: 2.174809 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002157 | Grad Max: 0.006427 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013609 | Grad Max: 0.013609 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003616 | Grad Max: 0.929129 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067045 | Grad Max: 5.160577 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000418 | Grad Max: 0.013327 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040483 | Grad Max: 0.223030 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000570 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007982 | Grad Max: 0.015324 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000419 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001991 | Grad Max: 0.005948 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001023 | Grad Max: 0.002813 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029583 | Grad Max: 0.029583 [GRADIENT NORM TOTAL] 12.3382 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.151 | Max: 1.264 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50818455 0.49181545] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 724/1324 | B: 663/1385 | C: 631/1417 [LOSS Ex1] A: 0.62759 | B: 0.60441 | C: 0.60272 [LOGITS Ex2 A] Mean Abs: 2.324 | Max: 7.268 [LOSS Ex2] A: 0.08583 | B: 0.29242 | C: 0.21780 ** [JOINT LOSS] ** : 0.810258 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005835 | Grad Max: 0.304969 -> Layer: shared_layers.0.bias | Grad Mean: 0.705495 | Grad Max: 4.137354 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.005682 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001760 | Grad Max: 0.001760 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004375 | Grad Max: 0.687680 -> Layer: exit2_layers.0.bias | Grad Mean: 0.081342 | Grad Max: 3.801893 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000429 | Grad Max: 0.013369 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043184 | Grad Max: 0.218275 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000578 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008313 | Grad Max: 0.017177 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000396 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002020 | Grad Max: 0.005733 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000831 | Grad Max: 0.002801 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027085 | Grad Max: 0.027085 [GRADIENT NORM TOTAL] 15.9979 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.189 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5054401 0.49455982] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 720/1328 | B: 620/1236 | C: 653/1395 [LOSS Ex1] A: 0.62425 | B: 0.60845 | C: 0.59974 [LOGITS Ex2 A] Mean Abs: 2.342 | Max: 8.112 [LOSS Ex2] A: 0.11209 | B: 0.31312 | C: 0.23254 ** [JOINT LOSS] ** : 
0.830065 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010043 | Grad Max: 0.403141 -> Layer: shared_layers.0.bias | Grad Mean: 1.037346 | Grad Max: 5.367427 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002228 | Grad Max: 0.006371 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003261 | Grad Max: 0.003261 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006606 | Grad Max: 0.964795 -> Layer: exit2_layers.0.bias | Grad Mean: 0.123083 | Grad Max: 5.326637 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000684 | Grad Max: 0.023427 -> Layer: exit2_layers.3.bias | Grad Mean: 0.067982 | Grad Max: 0.378365 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000865 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013199 | Grad Max: 0.026121 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000661 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003246 | Grad Max: 0.010234 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001473 | Grad Max: 0.003327 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044933 | Grad Max: 0.044933 [GRADIENT NORM TOTAL] 22.9372 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5105302 0.4894698] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 723/1325 | B: 667/1381 | C: 637/1411 [LOSS Ex1] A: 0.62321 | B: 0.60849 | C: 0.60586 [LOGITS Ex2 A] Mean Abs: 2.295 | Max: 7.286 [LOSS Ex2] A: 0.10958 | B: 0.31550 | C: 0.23599 ** [JOINT LOSS] ** : 0.832877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007754 | Grad Max: 0.289705 -> Layer: shared_layers.0.bias | Grad Mean: 0.780934 | Grad Max: 4.050404 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002141 | Grad Max: 0.006061 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001791 | Grad Max: 0.001791 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005041 | Grad 
Max: 0.774583 -> Layer: exit2_layers.0.bias | Grad Mean: 0.093456 | Grad Max: 4.306075 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.017860 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052942 | Grad Max: 0.284025 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000071 | Grad Max: 0.000693 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010276 | Grad Max: 0.020700 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002417 | Grad Max: 0.007714 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.002810 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031970 | Grad Max: 0.031970 [GRADIENT NORM TOTAL] 17.4558 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.015 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006144 0.49938563] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.086 [MASKS] A(Pass/Fail): 689/1359 | B: 650/1398 | C: 631/1417 [LOSS Ex1] A: 0.63221 | B: 0.60884 | C: 0.59844 [LOGITS Ex2 A] Mean Abs: 2.221 | Max: 6.981 [LOSS Ex2] A: 0.09588 | B: 0.30451 | C: 0.20467 ** [JOINT LOSS] ** : 0.814853 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002597 | Grad Max: 0.081854 -> Layer: shared_layers.0.bias | Grad Mean: 0.244519 | Grad Max: 1.197058 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005711 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008681 | Grad Max: 0.008681 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001523 | Grad Max: 0.681634 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027652 | Grad Max: 3.788115 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000111 | Grad Max: 0.005542 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010730 | Grad Max: 0.074680 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000238 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002075 | Grad Max: 0.005545 
-> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000158 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000475 | Grad Max: 0.002089 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000379 | Grad Max: 0.001718 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007102 | Grad Max: 0.007102 [GRADIENT NORM TOTAL] 7.2126 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.909 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54224277 0.45775723] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 693/1355 | B: 663/1385 | C: 627/1421 [LOSS Ex1] A: 0.63293 | B: 0.60434 | C: 0.60359 [LOGITS Ex2 A] Mean Abs: 2.182 | Max: 6.290 [LOSS Ex2] A: 0.11644 | B: 0.29790 | C: 0.19456 ** [JOINT LOSS] ** : 0.816586 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006606 | Grad Max: 0.217319 -> Layer: shared_layers.0.bias | Grad Mean: 0.587575 | Grad Max: 2.946973 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.005235 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003682 | Grad Max: 0.003682 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003935 | Grad Max: 0.664878 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073101 | Grad Max: 3.729229 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012620 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037124 | Grad Max: 0.198991 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000538 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007207 | Grad Max: 0.015318 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000441 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001715 | Grad Max: 0.006271 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000776 | Grad Max: 0.002552 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023053 | Grad Max: 0.023053 [GRADIENT NORM TOTAL] 13.5601 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.140 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8208916 0.1791083] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.087 [MASKS] A(Pass/Fail): 750/1298 | B: 620/1236 | C: 622/1426 [LOSS Ex1] A: 0.62631 | B: 0.60838 | C: 0.60325 [LOGITS Ex2 A] Mean Abs: 2.243 | Max: 6.657 [LOSS Ex2] A: 0.08730 | B: 0.29662 | C: 0.21957 ** [JOINT LOSS] ** : 0.813812 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003363 | Grad Max: 0.193802 -> Layer: shared_layers.0.bias | Grad Mean: 0.475557 | Grad Max: 2.492187 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005193 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002084 | Grad Max: 0.002084 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003116 | Grad Max: 0.363835 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058099 | Grad Max: 2.053047 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000313 | Grad Max: 0.010879 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031400 | Grad Max: 0.167021 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000448 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006025 | Grad Max: 0.012564 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000372 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001443 | Grad Max: 0.005393 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000656 | Grad Max: 0.002311 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019855 | Grad Max: 0.019855 [GRADIENT NORM TOTAL] 10.7189 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.266 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005275 0.4994725] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 725/1323 | B: 667/1381 | C: 639/1409 [LOSS Ex1] A: 0.63357 | B: 0.60842 | C: 0.60217 [LOGITS Ex2 A] Mean Abs: 
2.292 | Max: 7.176 [LOSS Ex2] A: 0.09235 | B: 0.30931 | C: 0.21520 ** [JOINT LOSS] ** : 0.820341 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004140 | Grad Max: 0.193104 -> Layer: shared_layers.0.bias | Grad Mean: 0.366083 | Grad Max: 2.231292 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.006318 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005710 | Grad Max: 0.005710 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002296 | Grad Max: 0.537838 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042401 | Grad Max: 2.999084 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000236 | Grad Max: 0.009156 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023190 | Grad Max: 0.130984 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000362 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004529 | Grad Max: 0.009887 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000218 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001120 | Grad Max: 0.003387 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000453 | Grad Max: 0.001978 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014482 | Grad Max: 0.014482 [GRADIENT NORM TOTAL] 8.5070 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.894 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7463964 0.25360358] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.087 [MASKS] A(Pass/Fail): 713/1335 | B: 650/1398 | C: 412/964 [LOSS Ex1] A: 0.62860 | B: 0.60876 | C: 0.60860 [LOGITS Ex2 A] Mean Abs: 2.271 | Max: 6.104 [LOSS Ex2] A: 0.10731 | B: 0.31244 | C: 0.20876 ** [JOINT LOSS] ** : 0.824825 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005685 | Grad Max: 0.232836 -> Layer: shared_layers.0.bias | Grad Mean: 0.568788 | Grad Max: 3.028514 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002052 | Grad Max: 0.005115 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.000046 | Grad Max: 0.000046 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003658 | Grad Max: 0.819113 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067476 | Grad Max: 4.576969 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000370 | Grad Max: 0.011554 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036482 | Grad Max: 0.188060 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007167 | Grad Max: 0.014685 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001772 | Grad Max: 0.005700 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000810 | Grad Max: 0.002472 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024804 | Grad Max: 0.024804 [GRADIENT NORM TOTAL] 13.1534 [EPOCH SUMMARY] Train Loss: 0.8216 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.7990 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.8008 -> New: 0.7990) ############################## EPOCH 170/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.037 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6440113 0.35598868] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 590/1026 | B: 663/1385 | C: 664/1384 [LOSS Ex1] A: 0.62672 | B: 0.60426 | C: 0.59545 [LOGITS Ex2 A] Mean Abs: 2.285 | Max: 12.082 [LOSS Ex2] A: 0.09960 | B: 0.28249 | C: 0.21549 ** [JOINT LOSS] ** : 0.808004 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002403 | Grad Max: 0.060595 -> Layer: shared_layers.0.bias | Grad Mean: 0.088958 | Grad Max: 0.386877 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002242 | Grad Max: 0.006309 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009027 | Grad Max: 0.009027 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000716 | Grad 
Max: 0.349422 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012279 | Grad Max: 1.932204 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000048 | Grad Max: 0.002985 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003685 | Grad Max: 0.036730 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000141 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000728 | Grad Max: 0.002988 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000084 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000208 | Grad Max: 0.001066 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000336 | Grad Max: 0.001388 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004581 | Grad Max: 0.004581 [GRADIENT NORM TOTAL] 3.2603 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.267 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082134 0.49178657] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 725/1323 | B: 620/1236 | C: 611/1437 [LOSS Ex1] A: 0.62748 | B: 0.60829 | C: 0.60536 [LOGITS Ex2 A] Mean Abs: 2.268 | Max: 10.262 [LOSS Ex2] A: 0.09512 | B: 0.28910 | C: 0.21295 ** [JOINT LOSS] ** : 0.812767 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003111 | Grad Max: 0.112634 -> Layer: shared_layers.0.bias | Grad Mean: 0.271123 | Grad Max: 1.133628 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002108 | Grad Max: 0.005424 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003920 | Grad Max: 0.003920 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001730 | Grad Max: 0.211193 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031696 | Grad Max: 1.165500 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000184 | Grad Max: 0.007121 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018263 | Grad Max: 0.111806 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000302 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003491 | Grad Max: 0.008170 
-> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000194 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000813 | Grad Max: 0.002940 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000411 | Grad Max: 0.001830 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011048 | Grad Max: 0.011048 [GRADIENT NORM TOTAL] 5.6174 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.192 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5053226 0.49467742] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 720/1328 | B: 668/1380 | C: 629/1419 [LOSS Ex1] A: 0.62414 | B: 0.60832 | C: 0.61101 [LOGITS Ex2 A] Mean Abs: 2.286 | Max: 5.407 [LOSS Ex2] A: 0.09620 | B: 0.30875 | C: 0.23619 ** [JOINT LOSS] ** : 0.828207 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003965 | Grad Max: 0.093667 -> Layer: shared_layers.0.bias | Grad Mean: 0.272304 | Grad Max: 1.341745 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005930 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000242 | Grad Max: 0.000242 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001868 | Grad Max: 0.250564 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034308 | Grad Max: 1.410381 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000170 | Grad Max: 0.006546 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016175 | Grad Max: 0.096514 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000334 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003131 | Grad Max: 0.008215 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000204 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000707 | Grad Max: 0.002880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000305 | Grad Max: 0.001392 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007908 | Grad Max: 0.007908 [GRADIENT NORM TOTAL] 6.3276 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.228 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5106332 0.4893668] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 723/1325 | B: 650/1398 | C: 623/1425 [LOSS Ex1] A: 0.62309 | B: 0.60866 | C: 0.60449 [LOGITS Ex2 A] Mean Abs: 2.236 | Max: 6.615 [LOSS Ex2] A: 0.10849 | B: 0.30609 | C: 0.20847 ** [JOINT LOSS] ** : 0.819762 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003826 | Grad Max: 0.095459 -> Layer: shared_layers.0.bias | Grad Mean: 0.273395 | Grad Max: 1.291165 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.006497 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001422 | Grad Max: 0.001422 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001907 | Grad Max: 0.310626 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034639 | Grad Max: 1.728630 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000171 | Grad Max: 0.005694 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016519 | Grad Max: 0.080058 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000329 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003275 | Grad Max: 0.008145 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000801 | Grad Max: 0.003120 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000414 | Grad Max: 0.001550 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010056 | Grad Max: 0.010056 [GRADIENT NORM TOTAL] 6.5568 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 1.018 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50064445 0.49935552] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.086 [MASKS] A(Pass/Fail): 690/1358 | B: 663/1385 | C: 664/1384 [LOSS Ex1] A: 0.63207 | B: 0.60415 | C: 0.59864 [LOGITS Ex2 A] Mean Abs: 
2.189 | Max: 6.019 [LOSS Ex2] A: 0.08846 | B: 0.28378 | C: 0.19041 ** [JOINT LOSS] ** : 0.799171 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005138 | Grad Max: 0.152186 -> Layer: shared_layers.0.bias | Grad Mean: 0.422047 | Grad Max: 2.079322 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.006306 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011225 | Grad Max: 0.011225 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002874 | Grad Max: 0.623386 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052891 | Grad Max: 3.495676 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000291 | Grad Max: 0.011409 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028616 | Grad Max: 0.173913 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005605 | Grad Max: 0.012103 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000357 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001334 | Grad Max: 0.004924 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000614 | Grad Max: 0.002520 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017820 | Grad Max: 0.017820 [GRADIENT NORM TOTAL] 10.0161 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.911 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.542105 0.45789495] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 693/1355 | B: 620/1236 | C: 653/1395 [LOSS Ex1] A: 0.63279 | B: 0.60819 | C: 0.59786 [LOGITS Ex2 A] Mean Abs: 2.167 | Max: 6.495 [LOSS Ex2] A: 0.10707 | B: 0.30163 | C: 0.22103 ** [JOINT LOSS] ** : 0.822856 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007071 | Grad Max: 0.193284 -> Layer: shared_layers.0.bias | Grad Mean: 0.476618 | Grad Max: 2.378479 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002105 | Grad Max: 0.005183 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.006809 | Grad Max: 0.006809 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003177 | Grad Max: 0.564853 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058820 | Grad Max: 3.168945 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000338 | Grad Max: 0.010087 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032789 | Grad Max: 0.153971 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000536 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006434 | Grad Max: 0.013527 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000366 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001540 | Grad Max: 0.005100 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000763 | Grad Max: 0.002390 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021331 | Grad Max: 0.021331 [GRADIENT NORM TOTAL] 10.5030 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.144 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8213929 0.17860706] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 750/1298 | B: 668/1380 | C: 599/1449 [LOSS Ex1] A: 0.62617 | B: 0.60822 | C: 0.60238 [LOGITS Ex2 A] Mean Abs: 2.261 | Max: 6.971 [LOSS Ex2] A: 0.09830 | B: 0.30763 | C: 0.20528 ** [JOINT LOSS] ** : 0.815991 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004241 | Grad Max: 0.136017 -> Layer: shared_layers.0.bias | Grad Mean: 0.273247 | Grad Max: 1.266063 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002262 | Grad Max: 0.005887 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010002 | Grad Max: 0.010002 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001937 | Grad Max: 0.380881 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035662 | Grad Max: 2.115374 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.006317 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018435 | Grad Max: 0.099078 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | 
Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003711 | Grad Max: 0.008046 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000961 | Grad Max: 0.003211 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000450 | Grad Max: 0.001779 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013103 | Grad Max: 0.013103 [GRADIENT NORM TOTAL] 6.6631 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.270 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50062567 0.49937436] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.086 [MASKS] A(Pass/Fail): 725/1323 | B: 650/1398 | C: 618/1430 [LOSS Ex1] A: 0.63342 | B: 0.60855 | C: 0.60436 [LOGITS Ex2 A] Mean Abs: 2.280 | Max: 6.655 [LOSS Ex2] A: 0.08870 | B: 0.30836 | C: 0.19570 ** [JOINT LOSS] ** : 0.813036 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002927 | Grad Max: 0.149127 -> Layer: shared_layers.0.bias | Grad Mean: 0.308411 | Grad Max: 2.007177 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005453 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001094 | Grad Max: 0.001094 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002180 | Grad Max: 0.325823 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040019 | Grad Max: 1.804268 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000187 | Grad Max: 0.007431 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018363 | Grad Max: 0.114990 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000269 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003329 | Grad Max: 0.007356 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000162 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000804 | Grad Max: 0.002239 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000477 | Grad Max: 0.001636 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010553 | Grad Max: 
0.010553 [GRADIENT NORM TOTAL] 7.6352 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 0.896 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7468111 0.25318885] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.088 [MASKS] A(Pass/Fail): 713/1335 | B: 663/1385 | C: 661/1387 [LOSS Ex1] A: 0.62844 | B: 0.60404 | C: 0.59933 [LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.819 [LOSS Ex2] A: 0.10942 | B: 0.28749 | C: 0.19264 ** [JOINT LOSS] ** : 0.807119 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003089 | Grad Max: 0.079800 -> Layer: shared_layers.0.bias | Grad Mean: 0.215075 | Grad Max: 1.168709 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005484 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002074 | Grad Max: 0.002074 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001169 | Grad Max: 0.658572 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020621 | Grad Max: 3.657030 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000091 | Grad Max: 0.004709 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008391 | Grad Max: 0.068943 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000012 | Grad Max: 0.000202 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001599 | Grad Max: 0.004620 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000137 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000348 | Grad Max: 0.001940 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000392 | Grad Max: 0.001589 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004336 | Grad Max: 0.004336 [GRADIENT NORM TOTAL] 5.7597 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.040 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64418465 0.35581535] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 590/1026 | B: 620/1236 | 
C: 635/1413 [LOSS Ex1] A: 0.62655 | B: 0.60807 | C: 0.60797 [LOGITS Ex2 A] Mean Abs: 2.264 | Max: 11.798 [LOSS Ex2] A: 0.09742 | B: 0.29243 | C: 0.20975 ** [JOINT LOSS] ** : 0.814066 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003731 | Grad Max: 0.133300 -> Layer: shared_layers.0.bias | Grad Mean: 0.395664 | Grad Max: 1.863499 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005522 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002207 | Grad Max: 0.002207 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002376 | Grad Max: 0.748567 -> Layer: exit2_layers.0.bias | Grad Mean: 0.043880 | Grad Max: 4.148497 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000240 | Grad Max: 0.007482 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024024 | Grad Max: 0.121546 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000413 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004639 | Grad Max: 0.010448 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000266 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001115 | Grad Max: 0.004005 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000538 | Grad Max: 0.002058 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015758 | Grad Max: 0.015758 [GRADIENT NORM TOTAL] 9.1656 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.272 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082613 0.4917387] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 725/1323 | B: 668/1380 | C: 634/1414 [LOSS Ex1] A: 0.62731 | B: 0.60811 | C: 0.60496 [LOGITS Ex2 A] Mean Abs: 2.292 | Max: 10.014 [LOSS Ex2] A: 0.08766 | B: 0.31266 | C: 0.22497 ** [JOINT LOSS] ** : 0.821891 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002312 | Grad Max: 0.095213 -> Layer: shared_layers.0.bias | Grad Mean: 0.150131 | Grad Max: 1.252135 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002146 | Grad Max: 0.005258 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002382 | Grad Max: 0.002382 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001166 | Grad Max: 0.324423 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020922 | Grad Max: 1.806336 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000088 | Grad Max: 0.004164 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007942 | Grad Max: 0.062977 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000217 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001391 | Grad Max: 0.004855 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000120 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000314 | Grad Max: 0.001488 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001111 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003178 | Grad Max: 0.003178 [GRADIENT NORM TOTAL] 4.5573 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.196 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5052098 0.4947902] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 720/1328 | B: 650/1398 | C: 636/1412 [LOSS Ex1] A: 0.62397 | B: 0.60843 | C: 0.60588 [LOGITS Ex2 A] Mean Abs: 2.296 | Max: 8.578 [LOSS Ex2] A: 0.10009 | B: 0.30331 | C: 0.21182 ** [JOINT LOSS] ** : 0.817835 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003305 | Grad Max: 0.075037 -> Layer: shared_layers.0.bias | Grad Mean: 0.210236 | Grad Max: 1.103128 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.006092 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000798 | Grad Max: 0.000798 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001311 | Grad Max: 0.339066 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023667 | Grad Max: 1.887164 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000098 | Grad Max: 0.004786 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.009376 | Grad Max: 0.060975 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000234 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001847 | Grad Max: 0.005352 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000166 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000414 | Grad Max: 0.002190 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000330 | Grad Max: 0.001067 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004232 | Grad Max: 0.004232 [GRADIENT NORM TOTAL] 5.2590 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.233 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5107318 0.48926815] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 723/1325 | B: 664/1384 | C: 630/1418 [LOSS Ex1] A: 0.62292 | B: 0.60391 | C: 0.60640 [LOGITS Ex2 A] Mean Abs: 2.244 | Max: 7.036 [LOSS Ex2] A: 0.10812 | B: 0.28411 | C: 0.21968 ** [JOINT LOSS] ** : 0.815050 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003683 | Grad Max: 0.124169 -> Layer: shared_layers.0.bias | Grad Mean: 0.206025 | Grad Max: 1.340376 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005917 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007475 | Grad Max: 0.007475 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001388 | Grad Max: 0.191351 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023807 | Grad Max: 1.054265 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000134 | Grad Max: 0.006126 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012424 | Grad Max: 0.077958 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000218 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002150 | Grad Max: 0.005745 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000554 | Grad Max: 0.001922 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000325 | 
Grad Max: 0.001485 -> Layer: exit2_layers.12.bias | Grad Mean: 0.008181 | Grad Max: 0.008181 [GRADIENT NORM TOTAL] 4.3625 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.021 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006232 0.49937674] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.563 | Std: 0.086 [MASKS] A(Pass/Fail): 691/1357 | B: 620/1236 | C: 439/937 [LOSS Ex1] A: 0.63191 | B: 0.60795 | C: 0.59653 [LOGITS Ex2 A] Mean Abs: 2.245 | Max: 5.978 [LOSS Ex2] A: 0.08788 | B: 0.28156 | C: 0.19016 ** [JOINT LOSS] ** : 0.798660 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002878 | Grad Max: 0.097087 -> Layer: shared_layers.0.bias | Grad Mean: 0.125730 | Grad Max: 0.844214 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005677 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003209 | Grad Max: 0.003209 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000915 | Grad Max: 0.306265 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015470 | Grad Max: 1.707743 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000050 | Grad Max: 0.003442 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002941 | Grad Max: 0.039180 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000142 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000492 | Grad Max: 0.002449 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000120 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000179 | Grad Max: 0.001491 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000503 | Grad Max: 0.001474 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002246 | Grad Max: 0.002246 [GRADIENT NORM TOTAL] 3.7375 [EPOCH SUMMARY] Train Loss: 0.8139 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.7985 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.7990 -> New: 0.7985) ############################## EPOCH 171/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.139 | Max: 0.914 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5421049 0.45789507] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 693/1355 | B: 668/1380 | C: 653/1395 [LOSS Ex1] A: 0.63263 | B: 0.60798 | C: 0.60139 [LOGITS Ex2 A] Mean Abs: 2.241 | Max: 6.678 [LOSS Ex2] A: 0.10236 | B: 0.30862 | C: 0.20202 ** [JOINT LOSS] ** : 0.818335 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003188 | Grad Max: 0.103001 -> Layer: shared_layers.0.bias | Grad Mean: 0.094367 | Grad Max: 0.364185 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005377 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003095 | Grad Max: 0.003095 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001005 | Grad Max: 0.453149 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017189 | Grad Max: 2.544601 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000066 | Grad Max: 0.004491 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005221 | Grad Max: 0.056732 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000160 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000846 | Grad Max: 0.003357 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000080 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000203 | Grad Max: 0.001007 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000338 | Grad Max: 0.001040 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002353 | Grad Max: 0.002353 [GRADIENT NORM TOTAL] 4.0281 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.149 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.82225156 0.17774843] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 750/1298 | B: 650/1398 | C: 594/1454 [LOSS Ex1] A: 0.62599 | B: 0.60829 | C: 0.60481 [LOGITS Ex2 A] Mean Abs: 2.281 | Max: 7.284 [LOSS Ex2] A: 0.09363 | B: 0.29899 | C: 0.19865 ** [JOINT LOSS] ** : 0.810119 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001829 | Grad Max: 0.047382 -> Layer: shared_layers.0.bias | Grad Mean: 0.079020 | Grad Max: 0.675747 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005340 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002674 | Grad Max: 0.002674 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000667 | Grad Max: 0.173515 -> Layer: exit2_layers.0.bias | Grad Mean: 0.011493 | Grad Max: 0.967191 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002482 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002916 | Grad Max: 0.025657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000162 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000601 | Grad Max: 0.002650 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000083 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000143 | Grad Max: 0.000769 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000397 | Grad Max: 0.001219 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001860 | Grad Max: 0.001860 [GRADIENT NORM TOTAL] 2.4903 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.277 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006153 0.4993847] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.086 [MASKS] A(Pass/Fail): 725/1323 | B: 665/1383 | C: 639/1409 [LOSS Ex1] A: 0.63325 | B: 0.60375 | C: 0.60142 [LOGITS Ex2 A] Mean Abs: 2.319 | Max: 7.261 [LOSS Ex2] A: 0.08404 | B: 0.28121 | C: 0.20843 ** [JOINT LOSS] ** : 0.804030 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002718 | Grad Max: 0.060754 -> Layer: shared_layers.0.bias | Grad 
Mean: 0.138705 | Grad Max: 0.681695 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005223 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001963 | Grad Max: 0.001963 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000900 | Grad Max: 0.561943 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016127 | Grad Max: 3.116124 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000062 | Grad Max: 0.002040 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005227 | Grad Max: 0.028963 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000213 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000955 | Grad Max: 0.003617 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000099 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000217 | Grad Max: 0.001079 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000432 | Grad Max: 0.001130 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002059 | Grad Max: 0.002059 [GRADIENT NORM TOTAL] 4.6548 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.900 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.74756616 0.25243384] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.088 [MASKS] A(Pass/Fail): 713/1335 | B: 621/1235 | C: 631/1417 [LOSS Ex1] A: 0.62824 | B: 0.60777 | C: 0.60703 [LOGITS Ex2 A] Mean Abs: 2.286 | Max: 6.639 [LOSS Ex2] A: 0.10638 | B: 0.28649 | C: 0.20204 ** [JOINT LOSS] ** : 0.812652 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003742 | Grad Max: 0.086888 -> Layer: shared_layers.0.bias | Grad Mean: 0.223360 | Grad Max: 1.148190 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005298 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003145 | Grad Max: 0.003145 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001403 | Grad Max: 0.589452 -> Layer: exit2_layers.0.bias | Grad Mean: 0.024743 | Grad Max: 3.266058 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000100 | Grad Max: 0.004282 -> Layer: exit2_layers.3.bias | Grad Mean: 0.008784 | Grad Max: 0.051532 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000168 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001351 | Grad Max: 0.004459 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000111 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000299 | Grad Max: 0.001298 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000326 | Grad Max: 0.001170 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003465 | Grad Max: 0.003465 [GRADIENT NORM TOTAL] 5.8410 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.045 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6446194 0.35538054] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 590/1026 | B: 669/1379 | C: 653/1395 [LOSS Ex1] A: 0.62634 | B: 0.60780 | C: 0.60351 [LOGITS Ex2 A] Mean Abs: 2.344 | Max: 10.711 [LOSS Ex2] A: 0.09524 | B: 0.29986 | C: 0.20623 ** [JOINT LOSS] ** : 0.812994 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005853 | Grad Max: 0.144992 -> Layer: shared_layers.0.bias | Grad Mean: 0.251193 | Grad Max: 1.667842 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002070 | Grad Max: 0.005910 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003488 | Grad Max: 0.003488 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001721 | Grad Max: 0.285859 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031063 | Grad Max: 1.589072 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000174 | Grad Max: 0.005875 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015981 | Grad Max: 0.075735 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000384 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003293 | Grad Max: 0.007997 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000771 | Grad 
Max: 0.002544 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000381 | Grad Max: 0.001634 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009795 | Grad Max: 0.009795 [GRADIENT NORM TOTAL] 5.6695 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.279 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50827724 0.49172276] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 727/1321 | B: 650/1398 | C: 639/1409 [LOSS Ex1] A: 0.62709 | B: 0.60811 | C: 0.60183 [LOGITS Ex2 A] Mean Abs: 2.311 | Max: 9.693 [LOSS Ex2] A: 0.09128 | B: 0.30001 | C: 0.20887 ** [JOINT LOSS] ** : 0.812394 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002780 | Grad Max: 0.061787 -> Layer: shared_layers.0.bias | Grad Mean: 0.165926 | Grad Max: 0.757587 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002118 | Grad Max: 0.005281 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000471 | Grad Max: 0.000471 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001329 | Grad Max: 0.177168 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023965 | Grad Max: 0.986357 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.005903 -> Layer: exit2_layers.3.bias | Grad Mean: 0.010364 | Grad Max: 0.068414 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000223 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001865 | Grad Max: 0.004912 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000447 | Grad Max: 0.001498 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.001372 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007083 | Grad Max: 0.007083 [GRADIENT NORM TOTAL] 4.2106 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.202 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.5051736 0.4948264] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 720/1328 | B: 665/1383 | C: 633/1415 [LOSS Ex1] A: 0.62374 | B: 0.60357 | C: 0.60260 [LOGITS Ex2 A] Mean Abs: 2.314 | Max: 6.760 [LOSS Ex2] A: 0.09689 | B: 0.28694 | C: 0.22522 ** [JOINT LOSS] ** : 0.812984 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002081 | Grad Max: 0.086695 -> Layer: shared_layers.0.bias | Grad Mean: 0.100160 | Grad Max: 0.510429 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006238 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005955 | Grad Max: 0.005955 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000753 | Grad Max: 0.210069 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013305 | Grad Max: 1.160851 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000057 | Grad Max: 0.002793 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004856 | Grad Max: 0.028721 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000158 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000934 | Grad Max: 0.003427 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000274 | Grad Max: 0.001438 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000236 | Grad Max: 0.001061 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004855 | Grad Max: 0.004855 [GRADIENT NORM TOTAL] 2.8968 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.239 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51083744 0.4891625 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 724/1324 | B: 621/1235 | C: 652/1396 [LOSS Ex1] A: 0.62267 | B: 0.60758 | C: 0.59931 [LOGITS Ex2 A] Mean Abs: 2.323 | Max: 6.978 [LOSS Ex2] A: 0.09701 | B: 0.28267 | C: 0.20271 ** [JOINT LOSS] ** : 0.803986 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.003497 | Grad Max: 0.174659 -> Layer: shared_layers.0.bias | Grad Mean: 0.386331 | Grad Max: 2.338924 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002203 | Grad Max: 0.006178 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000901 | Grad Max: 0.000901 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002335 | Grad Max: 0.620205 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042604 | Grad Max: 3.467451 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000219 | Grad Max: 0.006529 -> Layer: exit2_layers.3.bias | Grad Mean: 0.021839 | Grad Max: 0.108675 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000350 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004237 | Grad Max: 0.009637 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000228 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001024 | Grad Max: 0.003292 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000438 | Grad Max: 0.001871 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013069 | Grad Max: 0.013069 [GRADIENT NORM TOTAL] 8.9340 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.026 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005842 0.4994158] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.087 [MASKS] A(Pass/Fail): 692/1356 | B: 669/1379 | C: 659/1389 [LOSS Ex1] A: 0.63167 | B: 0.60762 | C: 0.59924 [LOGITS Ex2 A] Mean Abs: 2.276 | Max: 7.005 [LOSS Ex2] A: 0.10108 | B: 0.30442 | C: 0.20411 ** [JOINT LOSS] ** : 0.816045 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003175 | Grad Max: 0.093630 -> Layer: shared_layers.0.bias | Grad Mean: 0.151761 | Grad Max: 0.712406 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005390 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005808 | Grad Max: 0.005808 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001122 | Grad Max: 0.424390 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.019491 | Grad Max: 2.396086 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000055 | Grad Max: 0.003077 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004061 | Grad Max: 0.038441 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000113 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000657 | Grad Max: 0.003043 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000097 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000164 | Grad Max: 0.000989 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000315 | Grad Max: 0.000967 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001413 | Grad Max: 0.001413 [GRADIENT NORM TOTAL] 4.6399 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.919 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54206496 0.45793504] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.083 [MASKS] A(Pass/Fail): 693/1355 | B: 650/1398 | C: 653/1395 [LOSS Ex1] A: 0.63241 | B: 0.60791 | C: 0.60214 [LOGITS Ex2 A] Mean Abs: 2.270 | Max: 6.404 [LOSS Ex2] A: 0.10195 | B: 0.30255 | C: 0.21424 ** [JOINT LOSS] ** : 0.820401 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006047 | Grad Max: 0.164568 -> Layer: shared_layers.0.bias | Grad Mean: 0.209480 | Grad Max: 1.130645 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005638 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009551 | Grad Max: 0.009551 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001461 | Grad Max: 0.569800 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026175 | Grad Max: 3.184562 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000138 | Grad Max: 0.004643 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012366 | Grad Max: 0.060410 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000323 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002658 | Grad Max: 0.006433 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000185 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000671 | Grad Max: 0.002342 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000433 | Grad Max: 0.001930 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010903 | Grad Max: 0.010903 [GRADIENT NORM TOTAL] 5.7254 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.155 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8234019 0.17659804] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 750/1298 | B: 665/1383 | C: 613/1435 [LOSS Ex1] A: 0.62575 | B: 0.60337 | C: 0.60665 [LOGITS Ex2 A] Mean Abs: 2.306 | Max: 6.980 [LOSS Ex2] A: 0.09148 | B: 0.27537 | C: 0.23455 ** [JOINT LOSS] ** : 0.812388 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003343 | Grad Max: 0.088480 -> Layer: shared_layers.0.bias | Grad Mean: 0.129182 | Grad Max: 0.785544 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005540 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000509 | Grad Max: 0.000509 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000887 | Grad Max: 0.517749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.015294 | Grad Max: 2.878430 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.001932 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002742 | Grad Max: 0.022657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000151 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000519 | Grad Max: 0.003035 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000092 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000128 | Grad Max: 0.000994 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000307 | Grad Max: 0.001012 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000699 | Grad Max: 0.000699 [GRADIENT NORM TOTAL] 4.4812 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.283 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50060576 0.49939418] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 725/1323 | B: 623/1233 | C: 688/1360 [LOSS Ex1] A: 0.63302 | B: 0.60738 | C: 0.59785 [LOGITS Ex2 A] Mean Abs: 2.336 | Max: 5.854 [LOSS Ex2] A: 0.08158 | B: 0.27941 | C: 0.20448 ** [JOINT LOSS] ** : 0.801240 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002416 | Grad Max: 0.068023 -> Layer: shared_layers.0.bias | Grad Mean: 0.137366 | Grad Max: 0.802258 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.006178 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003084 | Grad Max: 0.003084 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000843 | Grad Max: 0.514485 -> Layer: exit2_layers.0.bias | Grad Mean: 0.014832 | Grad Max: 2.859023 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000049 | Grad Max: 0.002906 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003418 | Grad Max: 0.028509 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000137 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000537 | Grad Max: 0.002528 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000165 | Grad Max: 0.000854 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000404 | Grad Max: 0.001063 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001475 | Grad Max: 0.001475 [GRADIENT NORM TOTAL] 4.4045 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 0.905 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7484275 0.2515725] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.088 [MASKS] A(Pass/Fail): 713/1335 | B: 670/1378 | C: 640/1408 [LOSS Ex1] A: 0.62800 | B: 0.60743 | C: 0.60545 [LOGITS Ex2 A] Mean Abs: 2.313 | Max: 7.430 
[LOSS Ex2] A: 0.11120 | B: 0.30203 | C: 0.21545 ** [JOINT LOSS] ** : 0.823189 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002500 | Grad Max: 0.060090 -> Layer: shared_layers.0.bias | Grad Mean: 0.100744 | Grad Max: 0.474059 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005588 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004879 | Grad Max: 0.004879 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000821 | Grad Max: 0.216559 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013818 | Grad Max: 1.196814 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000042 | Grad Max: 0.002941 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002233 | Grad Max: 0.029770 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000125 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000382 | Grad Max: 0.002623 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000082 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000093 | Grad Max: 0.000777 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000250 | Grad Max: 0.000750 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000611 | Grad Max: 0.000611 [GRADIENT NORM TOTAL] 3.1258 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.051 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6450479 0.35495207] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.088 [MASKS] A(Pass/Fail): 590/1026 | B: 652/1396 | C: 410/966 [LOSS Ex1] A: 0.62609 | B: 0.60772 | C: 0.60208 [LOGITS Ex2 A] Mean Abs: 2.348 | Max: 8.980 [LOSS Ex2] A: 0.08850 | B: 0.30039 | C: 0.22049 ** [JOINT LOSS] ** : 0.815092 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002715 | Grad Max: 0.086382 -> Layer: shared_layers.0.bias | Grad Mean: 0.165364 | Grad Max: 1.143511 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002152 | Grad Max: 0.006745 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014752 | Grad Max: 
0.014752 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001005 | Grad Max: 0.239619 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017524 | Grad Max: 1.310699 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000046 | Grad Max: 0.003006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002544 | Grad Max: 0.019629 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000150 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000406 | Grad Max: 0.002914 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000069 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000104 | Grad Max: 0.000843 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000328 | Grad Max: 0.001161 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001811 | Grad Max: 0.001811 [GRADIENT NORM TOTAL] 4.3260 [EPOCH SUMMARY] Train Loss: 0.8126 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.7946 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.7985 -> New: 0.7946) ############################## EPOCH 172/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.285 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50825983 0.4917402 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 727/1321 | B: 666/1382 | C: 643/1405 [LOSS Ex1] A: 0.62683 | B: 0.60317 | C: 0.60505 [LOGITS Ex2 A] Mean Abs: 2.359 | Max: 10.241 [LOSS Ex2] A: 0.08326 | B: 0.28046 | C: 0.19356 ** [JOINT LOSS] ** : 0.797444 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.088893 -> Layer: shared_layers.0.bias | Grad Mean: 0.132837 | Grad Max: 0.776347 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005513 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002712 | Grad Max: 0.002712 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000878 | Grad Max: 0.358079 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.015519 | Grad Max: 1.988845 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000037 | Grad Max: 0.002285 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002130 | Grad Max: 0.018823 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000117 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000372 | Grad Max: 0.002461 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000088 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000108 | Grad Max: 0.000738 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000457 | Grad Max: 0.001277 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000424 | Grad Max: 0.000424 [GRADIENT NORM TOTAL] 3.8705 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.208 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50506747 0.49493256] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 720/1328 | B: 623/1233 | C: 607/1441 [LOSS Ex1] A: 0.62348 | B: 0.60717 | C: 0.60588 [LOGITS Ex2 A] Mean Abs: 2.361 | Max: 7.271 [LOSS Ex2] A: 0.09597 | B: 0.28248 | C: 0.20764 ** [JOINT LOSS] ** : 0.807541 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002018 | Grad Max: 0.072332 -> Layer: shared_layers.0.bias | Grad Mean: 0.177001 | Grad Max: 1.103562 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002243 | Grad Max: 0.006528 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009480 | Grad Max: 0.009480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001236 | Grad Max: 0.334792 -> Layer: exit2_layers.0.bias | Grad Mean: 0.022269 | Grad Max: 1.863441 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000071 | Grad Max: 0.005079 -> Layer: exit2_layers.3.bias | Grad Mean: 0.006693 | Grad Max: 0.059247 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001142 | Grad Max: 0.004087 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000108 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000225 | Grad Max: 0.001077 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001192 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002820 | Grad Max: 0.002820 [GRADIENT NORM TOTAL] 4.7478 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.245 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51092565 0.48907435] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 724/1324 | B: 671/1377 | C: 654/1394 [LOSS Ex1] A: 0.62241 | B: 0.60722 | C: 0.60010 [LOGITS Ex2 A] Mean Abs: 2.366 | Max: 6.768 [LOSS Ex2] A: 0.10500 | B: 0.30383 | C: 0.21250 ** [JOINT LOSS] ** : 0.817023 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002694 | Grad Max: 0.076627 -> Layer: shared_layers.0.bias | Grad Mean: 0.145184 | Grad Max: 0.625474 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.006004 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001418 | Grad Max: 0.001418 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001326 | Grad Max: 0.435810 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023593 | Grad Max: 2.428090 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000113 | Grad Max: 0.006256 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011010 | Grad Max: 0.087691 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002204 | Grad Max: 0.005955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000156 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000523 | Grad Max: 0.002047 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.001106 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005810 | Grad Max: 0.005810 [GRADIENT NORM TOTAL] 4.9582 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.144 | Max: 1.031 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005005 0.4994995] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.087 [MASKS] A(Pass/Fail): 691/1357 | B: 654/1394 | C: 599/1449 [LOSS Ex1] A: 0.63142 | B: 0.60750 | C: 0.60673 [LOGITS Ex2 A] Mean Abs: 2.325 | Max: 6.023 [LOSS Ex2] A: 0.09777 | B: 0.29901 | C: 0.20989 ** [JOINT LOSS] ** : 0.817436 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003783 | Grad Max: 0.121533 -> Layer: shared_layers.0.bias | Grad Mean: 0.178976 | Grad Max: 0.934758 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.006137 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011988 | Grad Max: 0.011988 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001025 | Grad Max: 0.525363 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017347 | Grad Max: 2.939861 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.003420 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004695 | Grad Max: 0.033286 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000173 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001085 | Grad Max: 0.003736 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000101 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000307 | Grad Max: 0.001141 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000301 | Grad Max: 0.001357 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006256 | Grad Max: 0.006256 [GRADIENT NORM TOTAL] 4.9799 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.140 | Max: 0.924 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.542049 0.45795098] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.562 | Std: 0.084 [MASKS] A(Pass/Fail): 693/1355 | B: 667/1381 | C: 656/1392 [LOSS Ex1] A: 0.63216 | B: 0.60295 | C: 0.60070 [LOGITS Ex2 A] Mean Abs: 2.328 | Max: 6.183 
[LOSS Ex2] A: 0.09784 | B: 0.27477 | C: 0.20605 ** [JOINT LOSS] ** : 0.804823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003550 | Grad Max: 0.124347 -> Layer: shared_layers.0.bias | Grad Mean: 0.096788 | Grad Max: 0.339987 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005286 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005579 | Grad Max: 0.005579 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000814 | Grad Max: 0.181765 -> Layer: exit2_layers.0.bias | Grad Mean: 0.013727 | Grad Max: 1.004215 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000060 | Grad Max: 0.002581 -> Layer: exit2_layers.3.bias | Grad Mean: 0.004232 | Grad Max: 0.027731 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000009 | Grad Max: 0.000198 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000938 | Grad Max: 0.003886 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000090 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000227 | Grad Max: 0.000998 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000350 | Grad Max: 0.001299 -> Layer: exit2_layers.12.bias | Grad Mean: 0.003476 | Grad Max: 0.003476 [GRADIENT NORM TOTAL] 2.7889 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.161 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8246873 0.1753127] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.088 [MASKS] A(Pass/Fail): 750/1298 | B: 623/1233 | C: 626/1422 [LOSS Ex1] A: 0.62548 | B: 0.60693 | C: 0.60173 [LOGITS Ex2 A] Mean Abs: 2.400 | Max: 7.662 [LOSS Ex2] A: 0.09756 | B: 0.28958 | C: 0.20750 ** [JOINT LOSS] ** : 0.809594 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007012 | Grad Max: 0.185278 -> Layer: shared_layers.0.bias | Grad Mean: 0.372080 | Grad Max: 1.649750 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005707 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000468 | Grad Max: 
0.000468 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002594 | Grad Max: 0.562041 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047005 | Grad Max: 3.137871 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000262 | Grad Max: 0.008213 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024968 | Grad Max: 0.127469 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000036 | Grad Max: 0.000410 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005007 | Grad Max: 0.010876 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000268 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001224 | Grad Max: 0.003818 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000537 | Grad Max: 0.001970 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016304 | Grad Max: 0.016304 [GRADIENT NORM TOTAL] 8.6520 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.148 | Max: 1.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50059986 0.4994001 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 725/1323 | B: 672/1376 | C: 684/1364 [LOSS Ex1] A: 0.63275 | B: 0.60700 | C: 0.59733 [LOGITS Ex2 A] Mean Abs: 2.380 | Max: 7.111 [LOSS Ex2] A: 0.08244 | B: 0.30697 | C: 0.20025 ** [JOINT LOSS] ** : 0.808912 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004030 | Grad Max: 0.127049 -> Layer: shared_layers.0.bias | Grad Mean: 0.176357 | Grad Max: 0.770244 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005463 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001446 | Grad Max: 0.001446 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001336 | Grad Max: 0.661899 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023189 | Grad Max: 3.664516 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000056 | Grad Max: 0.003375 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003082 | Grad Max: 0.038339 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000145 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.000422 | Grad Max: 0.002536 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000063 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000103 | Grad Max: 0.000501 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000321 | Grad Max: 0.000896 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001188 | Grad Max: 0.001188 [GRADIENT NORM TOTAL] 6.0130 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.910 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7495055 0.2504945] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.088 [MASKS] A(Pass/Fail): 713/1335 | B: 654/1394 | C: 609/1439 [LOSS Ex1] A: 0.62771 | B: 0.60728 | C: 0.60725 [LOGITS Ex2 A] Mean Abs: 2.350 | Max: 7.321 [LOSS Ex2] A: 0.10817 | B: 0.29889 | C: 0.22494 ** [JOINT LOSS] ** : 0.824750 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003233 | Grad Max: 0.112172 -> Layer: shared_layers.0.bias | Grad Mean: 0.210435 | Grad Max: 1.302880 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.005293 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005176 | Grad Max: 0.005176 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001550 | Grad Max: 0.265168 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027758 | Grad Max: 1.433430 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000141 | Grad Max: 0.005802 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013676 | Grad Max: 0.088074 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000253 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002409 | Grad Max: 0.006349 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000148 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000609 | Grad Max: 0.001794 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000311 | Grad Max: 0.001255 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009492 | Grad Max: 0.009492 [GRADIENT NORM 
TOTAL] 5.1380 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.058 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64569914 0.35430086] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 590/1026 | B: 667/1381 | C: 636/1412 [LOSS Ex1] A: 0.62580 | B: 0.60273 | C: 0.60147 [LOGITS Ex2 A] Mean Abs: 2.431 | Max: 9.096 [LOSS Ex2] A: 0.09439 | B: 0.28739 | C: 0.19880 ** [JOINT LOSS] ** : 0.803523 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003688 | Grad Max: 0.116129 -> Layer: shared_layers.0.bias | Grad Mean: 0.264394 | Grad Max: 1.495054 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002203 | Grad Max: 0.005888 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009414 | Grad Max: 0.009415 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001731 | Grad Max: 0.337491 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031398 | Grad Max: 1.885916 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000102 | Grad Max: 0.004154 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009362 | Grad Max: 0.057009 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000216 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001873 | Grad Max: 0.005111 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000120 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000442 | Grad Max: 0.001455 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000316 | Grad Max: 0.001152 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005276 | Grad Max: 0.005277 [GRADIENT NORM TOTAL] 6.3547 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.293 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5082816 0.4917184] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 727/1321 | B: 624/1232 | C: 656/1392 [LOSS Ex1] A: 
0.62654 | B: 0.60672 | C: 0.59410 [LOGITS Ex2 A] Mean Abs: 2.407 | Max: 8.667 [LOSS Ex2] A: 0.07865 | B: 0.28832 | C: 0.18457 ** [JOINT LOSS] ** : 0.792968 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002636 | Grad Max: 0.075436 -> Layer: shared_layers.0.bias | Grad Mean: 0.193137 | Grad Max: 1.029533 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002170 | Grad Max: 0.005375 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003707 | Grad Max: 0.003707 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001330 | Grad Max: 0.472169 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023576 | Grad Max: 2.629771 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000065 | Grad Max: 0.004078 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005903 | Grad Max: 0.056526 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000209 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001223 | Grad Max: 0.004566 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000110 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000308 | Grad Max: 0.001401 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001189 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004613 | Grad Max: 0.004613 [GRADIENT NORM TOTAL] 5.4420 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.215 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5050559 0.49494413] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 720/1328 | B: 672/1376 | C: 653/1395 [LOSS Ex1] A: 0.62318 | B: 0.60680 | C: 0.60623 [LOGITS Ex2 A] Mean Abs: 2.381 | Max: 8.088 [LOSS Ex2] A: 0.09362 | B: 0.30458 | C: 0.21357 ** [JOINT LOSS] ** : 0.815995 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004453 | Grad Max: 0.150462 -> Layer: shared_layers.0.bias | Grad Mean: 0.326038 | Grad Max: 1.927025 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad 
Max: 0.006085 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003306 | Grad Max: 0.003306 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002014 | Grad Max: 0.647694 -> Layer: exit2_layers.0.bias | Grad Mean: 0.036248 | Grad Max: 3.616336 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.005386 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016194 | Grad Max: 0.095264 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000329 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003112 | Grad Max: 0.006742 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000189 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000774 | Grad Max: 0.002602 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000401 | Grad Max: 0.001553 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011968 | Grad Max: 0.011968 [GRADIENT NORM TOTAL] 7.8843 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.252 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5110809 0.4889191] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 724/1324 | B: 654/1394 | C: 682/1366 [LOSS Ex1] A: 0.62211 | B: 0.60708 | C: 0.59959 [LOGITS Ex2 A] Mean Abs: 2.384 | Max: 8.085 [LOSS Ex2] A: 0.09780 | B: 0.30012 | C: 0.20542 ** [JOINT LOSS] ** : 0.810707 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002151 | Grad Max: 0.057097 -> Layer: shared_layers.0.bias | Grad Mean: 0.145990 | Grad Max: 0.723725 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005871 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004687 | Grad Max: 0.004687 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001068 | Grad Max: 0.484310 -> Layer: exit2_layers.0.bias | Grad Mean: 0.019121 | Grad Max: 2.675334 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000045 | Grad Max: 0.003850 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003442 | Grad Max: 0.044321 -> 
Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000157 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000575 | Grad Max: 0.003139 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000079 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000115 | Grad Max: 0.000704 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000262 | Grad Max: 0.000859 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000190 | Grad Max: 0.000190 [GRADIENT NORM TOTAL] 5.0623 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.037 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5005007 0.4994993] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.087 [MASKS] A(Pass/Fail): 691/1357 | B: 667/1381 | C: 653/1395 [LOSS Ex1] A: 0.63112 | B: 0.60253 | C: 0.59821 [LOGITS Ex2 A] Mean Abs: 2.338 | Max: 6.049 [LOSS Ex2] A: 0.09320 | B: 0.26857 | C: 0.22304 ** [JOINT LOSS] ** : 0.805557 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003390 | Grad Max: 0.114803 -> Layer: shared_layers.0.bias | Grad Mean: 0.249121 | Grad Max: 1.499751 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005247 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000391 | Grad Max: 0.000391 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001714 | Grad Max: 0.217855 -> Layer: exit2_layers.0.bias | Grad Mean: 0.030513 | Grad Max: 1.210678 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000144 | Grad Max: 0.005468 -> Layer: exit2_layers.3.bias | Grad Mean: 0.014055 | Grad Max: 0.089696 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002455 | Grad Max: 0.005649 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000165 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000563 | Grad Max: 0.002173 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000352 | Grad Max: 0.001323 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.006647 | Grad Max: 0.006647 [GRADIENT NORM TOTAL] 5.7072 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.930 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5420251 0.4579749] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 693/1355 | B: 624/1232 | C: 431/945 [LOSS Ex1] A: 0.63188 | B: 0.60653 | C: 0.60182 [LOGITS Ex2 A] Mean Abs: 2.292 | Max: 6.716 [LOSS Ex2] A: 0.10084 | B: 0.28649 | C: 0.22106 ** [JOINT LOSS] ** : 0.816211 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005758 | Grad Max: 0.148356 -> Layer: shared_layers.0.bias | Grad Mean: 0.440257 | Grad Max: 1.900964 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005629 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004373 | Grad Max: 0.004373 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002831 | Grad Max: 0.348480 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052028 | Grad Max: 1.933696 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.008635 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029759 | Grad Max: 0.153114 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000040 | Grad Max: 0.000507 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005640 | Grad Max: 0.012961 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000320 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001278 | Grad Max: 0.004376 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000612 | Grad Max: 0.001868 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017045 | Grad Max: 0.017045 [GRADIENT NORM TOTAL] 9.2758 [EPOCH SUMMARY] Train Loss: 0.8095 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.7921 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! 
(Old: 0.7946 -> New: 0.7921) ############################## EPOCH 173/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.168 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8258951 0.17410488] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 750/1298 | B: 672/1376 | C: 661/1387 [LOSS Ex1] A: 0.62519 | B: 0.60662 | C: 0.59474 [LOGITS Ex2 A] Mean Abs: 2.388 | Max: 7.161 [LOSS Ex2] A: 0.09445 | B: 0.29393 | C: 0.19434 ** [JOINT LOSS] ** : 0.803090 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004448 | Grad Max: 0.139613 -> Layer: shared_layers.0.bias | Grad Mean: 0.152607 | Grad Max: 0.914248 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002214 | Grad Max: 0.006196 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005968 | Grad Max: 0.005968 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001225 | Grad Max: 0.203034 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020483 | Grad Max: 1.121511 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000058 | Grad Max: 0.002577 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003199 | Grad Max: 0.038052 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000007 | Grad Max: 0.000178 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000416 | Grad Max: 0.002719 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000101 | Grad Max: 0.000827 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000342 | Grad Max: 0.001115 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001111 | Grad Max: 0.001111 [GRADIENT NORM TOTAL] 4.2865 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.298 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5006913 0.49930874] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] 
Mean: 0.566 | Std: 0.087 [MASKS] A(Pass/Fail): 726/1322 | B: 654/1394 | C: 654/1394 [LOSS Ex1] A: 0.63249 | B: 0.60690 | C: 0.59669 [LOGITS Ex2 A] Mean Abs: 2.406 | Max: 7.352 [LOSS Ex2] A: 0.08458 | B: 0.29795 | C: 0.17639 ** [JOINT LOSS] ** : 0.798331 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004389 | Grad Max: 0.157060 -> Layer: shared_layers.0.bias | Grad Mean: 0.422676 | Grad Max: 2.129873 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005418 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006272 | Grad Max: 0.006272 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002795 | Grad Max: 0.536297 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051515 | Grad Max: 2.990288 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000250 | Grad Max: 0.008156 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025069 | Grad Max: 0.137210 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 0.000371 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004846 | Grad Max: 0.011400 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000257 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001159 | Grad Max: 0.003870 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000503 | Grad Max: 0.002109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015375 | Grad Max: 0.015375 [GRADIENT NORM TOTAL] 9.9766 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.915 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75030535 0.24969465] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.089 [MASKS] A(Pass/Fail): 713/1335 | B: 667/1381 | C: 628/1420 [LOSS Ex1] A: 0.62744 | B: 0.60236 | C: 0.60252 [LOGITS Ex2 A] Mean Abs: 2.357 | Max: 8.149 [LOSS Ex2] A: 0.10444 | B: 0.26986 | C: 0.20592 ** [JOINT LOSS] ** : 0.804179 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002383 | Grad Max: 0.061710 -> Layer: shared_layers.0.bias | 
Grad Mean: 0.160829 | Grad Max: 0.504162 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005679 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001291 | Grad Max: 0.001291 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001039 | Grad Max: 0.551152 -> Layer: exit2_layers.0.bias | Grad Mean: 0.018399 | Grad Max: 3.043483 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000040 | Grad Max: 0.002550 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002594 | Grad Max: 0.027999 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000005 | Grad Max: 0.000120 -> Layer: exit2_layers.6.bias | Grad Mean: 0.000436 | Grad Max: 0.002496 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000081 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000097 | Grad Max: 0.000675 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000355 | Grad Max: 0.001160 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000073 | Grad Max: 0.000073 [GRADIENT NORM TOTAL] 5.1023 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.064 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64606684 0.35393313] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 590/1026 | B: 624/1232 | C: 682/1366 [LOSS Ex1] A: 0.62552 | B: 0.60635 | C: 0.59694 [LOGITS Ex2 A] Mean Abs: 2.407 | Max: 11.860 [LOSS Ex2] A: 0.09794 | B: 0.29249 | C: 0.20100 ** [JOINT LOSS] ** : 0.806747 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005374 | Grad Max: 0.139659 -> Layer: shared_layers.0.bias | Grad Mean: 0.348334 | Grad Max: 1.310389 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002114 | Grad Max: 0.005503 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006715 | Grad Max: 0.006715 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002275 | Grad Max: 0.288180 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041835 | Grad Max: 1.609957 -> Layer: exit2_layers.3.weight | Grad Mean: 
0.000245 | Grad Max: 0.008245 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024052 | Grad Max: 0.127463 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000378 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004742 | Grad Max: 0.010052 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000287 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001098 | Grad Max: 0.003591 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000572 | Grad Max: 0.001811 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015500 | Grad Max: 0.015500 [GRADIENT NORM TOTAL] 7.0208 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.300 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083333 0.49166664] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 728/1320 | B: 672/1376 | C: 688/1360 [LOSS Ex1] A: 0.62628 | B: 0.60645 | C: 0.59873 [LOGITS Ex2 A] Mean Abs: 2.414 | Max: 9.473 [LOSS Ex2] A: 0.07942 | B: 0.30209 | C: 0.22929 ** [JOINT LOSS] ** : 0.814088 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003214 | Grad Max: 0.138558 -> Layer: shared_layers.0.bias | Grad Mean: 0.238418 | Grad Max: 1.130593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.005881 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001118 | Grad Max: 0.001118 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001741 | Grad Max: 0.226426 -> Layer: exit2_layers.0.bias | Grad Mean: 0.031396 | Grad Max: 1.243055 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000156 | Grad Max: 0.006353 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015321 | Grad Max: 0.100120 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000258 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002691 | Grad Max: 0.006984 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000186 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000577 | Grad 
Max: 0.002620 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000269 | Grad Max: 0.001072 -> Layer: exit2_layers.12.bias | Grad Mean: 0.005503 | Grad Max: 0.005503 [GRADIENT NORM TOTAL] 5.6598 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.220 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5048422 0.49515775] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 721/1327 | B: 654/1394 | C: 610/1438 [LOSS Ex1] A: 0.62292 | B: 0.60673 | C: 0.61050 [LOGITS Ex2 A] Mean Abs: 2.399 | Max: 8.406 [LOSS Ex2] A: 0.10007 | B: 0.29686 | C: 0.21721 ** [JOINT LOSS] ** : 0.818102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002575 | Grad Max: 0.068170 -> Layer: shared_layers.0.bias | Grad Mean: 0.120285 | Grad Max: 0.868813 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005692 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003775 | Grad Max: 0.003775 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000933 | Grad Max: 0.219625 -> Layer: exit2_layers.0.bias | Grad Mean: 0.016010 | Grad Max: 1.212247 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.002780 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005247 | Grad Max: 0.032508 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000180 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001074 | Grad Max: 0.004608 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000110 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000236 | Grad Max: 0.001081 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000243 | Grad Max: 0.000912 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002555 | Grad Max: 0.002555 [GRADIENT NORM TOTAL] 3.3135 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.258 [SAMPLE 0 PREDICTION A] 
Top2 Probs: [0.51124424 0.48875576] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 724/1324 | B: 667/1381 | C: 595/1453 [LOSS Ex1] A: 0.62185 | B: 0.60219 | C: 0.60831 [LOGITS Ex2 A] Mean Abs: 2.359 | Max: 6.335 [LOSS Ex2] A: 0.10041 | B: 0.28143 | C: 0.21474 ** [JOINT LOSS] ** : 0.809644 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004228 | Grad Max: 0.178674 -> Layer: shared_layers.0.bias | Grad Mean: 0.414080 | Grad Max: 2.277112 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005422 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000022 | Grad Max: 0.000022 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002792 | Grad Max: 0.594998 -> Layer: exit2_layers.0.bias | Grad Mean: 0.050532 | Grad Max: 3.279720 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000254 | Grad Max: 0.009365 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025597 | Grad Max: 0.147488 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000350 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004689 | Grad Max: 0.010157 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000282 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001121 | Grad Max: 0.003930 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000536 | Grad Max: 0.001809 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016120 | Grad Max: 0.016120 [GRADIENT NORM TOTAL] 9.9897 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.145 | Max: 1.042 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004989 0.4995011] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.087 [MASKS] A(Pass/Fail): 691/1357 | B: 624/1232 | C: 633/1415 [LOSS Ex1] A: 0.63088 | B: 0.60618 | C: 0.59985 [LOGITS Ex2 A] Mean Abs: 2.352 | Max: 6.426 [LOSS Ex2] A: 0.09006 | B: 0.28683 | C: 0.19962 ** [JOINT LOSS] ** : 0.804471 [GRADIENTS CHECK] -> Layer: 
shared_layers.0.weight | Grad Mean: 0.002835 | Grad Max: 0.109447 -> Layer: shared_layers.0.bias | Grad Mean: 0.272278 | Grad Max: 1.611253 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005420 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003329 | Grad Max: 0.003329 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001873 | Grad Max: 0.491988 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034067 | Grad Max: 2.759935 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000163 | Grad Max: 0.005257 -> Layer: exit2_layers.3.bias | Grad Mean: 0.016526 | Grad Max: 0.081773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000330 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003127 | Grad Max: 0.007990 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000195 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000685 | Grad Max: 0.002958 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000351 | Grad Max: 0.001652 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009253 | Grad Max: 0.009253 [GRADIENT NORM TOTAL] 6.8655 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.934 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5418613 0.45813867] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 692/1356 | B: 673/1375 | C: 660/1388 [LOSS Ex1] A: 0.63165 | B: 0.60629 | C: 0.60310 [LOGITS Ex2 A] Mean Abs: 2.372 | Max: 6.515 [LOSS Ex2] A: 0.10106 | B: 0.30193 | C: 0.23335 ** [JOINT LOSS] ** : 0.825793 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004287 | Grad Max: 0.224728 -> Layer: shared_layers.0.bias | Grad Mean: 0.466636 | Grad Max: 2.553741 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005694 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000572 | Grad Max: 0.000572 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002889 | Grad Max: 0.409577 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.053786 | Grad Max: 2.283252 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.010516 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030774 | Grad Max: 0.176105 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000429 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005618 | Grad Max: 0.013139 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000320 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001270 | Grad Max: 0.004826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000488 | Grad Max: 0.001660 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015575 | Grad Max: 0.015575 [GRADIENT NORM TOTAL] 10.0492 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.174 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.82687914 0.1731208 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 750/1298 | B: 654/1394 | C: 647/1401 [LOSS Ex1] A: 0.62497 | B: 0.60656 | C: 0.60326 [LOGITS Ex2 A] Mean Abs: 2.405 | Max: 9.016 [LOSS Ex2] A: 0.08698 | B: 0.29260 | C: 0.22652 ** [JOINT LOSS] ** : 0.813629 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003399 | Grad Max: 0.124184 -> Layer: shared_layers.0.bias | Grad Mean: 0.387512 | Grad Max: 1.615778 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | Grad Max: 0.005461 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001370 | Grad Max: 0.001370 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002517 | Grad Max: 0.400374 -> Layer: exit2_layers.0.bias | Grad Mean: 0.046357 | Grad Max: 2.222160 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000242 | Grad Max: 0.009997 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024571 | Grad Max: 0.159562 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000031 | Grad Max: 0.000320 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004505 | Grad Max: 0.009603 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000234 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001032 | Grad Max: 0.003586 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000395 | Grad Max: 0.001622 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012861 | Grad Max: 0.012861 [GRADIENT NORM TOTAL] 8.8461 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 1.305 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50070107 0.49929893] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 667/1381 | C: 619/1429 [LOSS Ex1] A: 0.63227 | B: 0.60202 | C: 0.60081 [LOGITS Ex2 A] Mean Abs: 2.387 | Max: 7.196 [LOSS Ex2] A: 0.07957 | B: 0.27498 | C: 0.20567 ** [JOINT LOSS] ** : 0.798440 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004516 | Grad Max: 0.159264 -> Layer: shared_layers.0.bias | Grad Mean: 0.380336 | Grad Max: 1.987584 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005884 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008561 | Grad Max: 0.008561 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002322 | Grad Max: 0.279119 -> Layer: exit2_layers.0.bias | Grad Mean: 0.042070 | Grad Max: 1.558046 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000223 | Grad Max: 0.007632 -> Layer: exit2_layers.3.bias | Grad Mean: 0.022741 | Grad Max: 0.125606 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000387 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004473 | Grad Max: 0.010318 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000011 | Grad Max: 0.000247 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001044 | Grad Max: 0.003625 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000505 | Grad Max: 0.001983 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014824 | Grad Max: 0.014824 [GRADIENT NORM TOTAL] 7.8873 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.149 | Max: 0.919 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75103396 0.24896605] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.089 [MASKS] A(Pass/Fail): 713/1335 | B: 624/1232 | C: 675/1373 [LOSS Ex1] A: 0.62721 | B: 0.60601 | C: 0.59772 [LOGITS Ex2 A] Mean Abs: 2.357 | Max: 7.982 [LOSS Ex2] A: 0.10326 | B: 0.29276 | C: 0.19197 ** [JOINT LOSS] ** : 0.806317 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005654 | Grad Max: 0.160190 -> Layer: shared_layers.0.bias | Grad Mean: 0.461844 | Grad Max: 2.207402 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.005730 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002510 | Grad Max: 0.002510 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002726 | Grad Max: 0.642411 -> Layer: exit2_layers.0.bias | Grad Mean: 0.049560 | Grad Max: 3.565703 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000252 | Grad Max: 0.006581 -> Layer: exit2_layers.3.bias | Grad Mean: 0.025078 | Grad Max: 0.119957 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000034 | Grad Max: 0.000433 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004791 | Grad Max: 0.010732 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000254 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001068 | Grad Max: 0.004003 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000496 | Grad Max: 0.001836 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014103 | Grad Max: 0.014103 [GRADIENT NORM TOTAL] 10.0062 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.069 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64633965 0.35366032] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 590/1026 | B: 673/1375 | C: 656/1392 [LOSS Ex1] A: 0.62531 | B: 0.60614 | C: 0.59688 [LOGITS Ex2 A] Mean Abs: 2.449 | Max: 
9.995 [LOSS Ex2] A: 0.08980 | B: 0.30120 | C: 0.18938 ** [JOINT LOSS] ** : 0.802903 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002776 | Grad Max: 0.098225 -> Layer: shared_layers.0.bias | Grad Mean: 0.204681 | Grad Max: 1.158149 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006485 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013640 | Grad Max: 0.013640 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001500 | Grad Max: 0.324999 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026844 | Grad Max: 1.803301 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000119 | Grad Max: 0.006004 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011645 | Grad Max: 0.070321 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002212 | Grad Max: 0.006130 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000170 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000507 | Grad Max: 0.001999 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000310 | Grad Max: 0.001203 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006012 | Grad Max: 0.006012 [GRADIENT NORM TOTAL] 5.2999 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.307 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083018 0.49169818] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 729/1319 | B: 654/1394 | C: 407/969 [LOSS Ex1] A: 0.62606 | B: 0.60641 | C: 0.61134 [LOGITS Ex2 A] Mean Abs: 2.413 | Max: 8.114 [LOSS Ex2] A: 0.08254 | B: 0.29790 | C: 0.20956 ** [JOINT LOSS] ** : 0.811271 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005017 | Grad Max: 0.174433 -> Layer: shared_layers.0.bias | Grad Mean: 0.387515 | Grad Max: 2.299068 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002023 | Grad Max: 0.005464 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000498 | Grad 
Max: 0.000498 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002575 | Grad Max: 0.523853 -> Layer: exit2_layers.0.bias | Grad Mean: 0.047191 | Grad Max: 2.926193 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000238 | Grad Max: 0.007033 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023502 | Grad Max: 0.117557 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000032 | Grad Max: 0.000345 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004620 | Grad Max: 0.009617 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000227 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001098 | Grad Max: 0.003287 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000444 | Grad Max: 0.001964 -> Layer: exit2_layers.12.bias | Grad Mean: 0.013499 | Grad Max: 0.013499 [GRADIENT NORM TOTAL] 9.2888 [EPOCH SUMMARY] Train Loss: 0.8084 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.7910 | Alpha: 0.5500 !!! BEST MODEL SAVED !!! (Old: 0.7921 -> New: 0.7910) ############################## EPOCH 174/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.225 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50467604 0.49532393] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 721/1327 | B: 667/1381 | C: 640/1408 [LOSS Ex1] A: 0.62271 | B: 0.60188 | C: 0.60506 [LOGITS Ex2 A] Mean Abs: 2.372 | Max: 6.969 [LOSS Ex2] A: 0.09637 | B: 0.27716 | C: 0.21620 ** [JOINT LOSS] ** : 0.806458 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002434 | Grad Max: 0.082042 -> Layer: shared_layers.0.bias | Grad Mean: 0.278686 | Grad Max: 1.141167 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002201 | Grad Max: 0.005604 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003997 | Grad Max: 0.003997 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001827 | Grad Max: 0.708885 -> 
Layer: exit2_layers.0.bias | Grad Mean: 0.033206 | Grad Max: 3.923262 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000165 | Grad Max: 0.006927 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017096 | Grad Max: 0.109724 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000021 | Grad Max: 0.000273 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003157 | Grad Max: 0.008089 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000008 | Grad Max: 0.000208 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000712 | Grad Max: 0.002567 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000337 | Grad Max: 0.001605 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009860 | Grad Max: 0.009860 [GRADIENT NORM TOTAL] 7.3684 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.263 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51130056 0.48869938] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 724/1324 | B: 624/1232 | C: 626/1422 [LOSS Ex1] A: 0.62164 | B: 0.60587 | C: 0.60500 [LOGITS Ex2 A] Mean Abs: 2.345 | Max: 6.300 [LOSS Ex2] A: 0.10542 | B: 0.28678 | C: 0.23278 ** [JOINT LOSS] ** : 0.819162 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003229 | Grad Max: 0.083460 -> Layer: shared_layers.0.bias | Grad Mean: 0.233306 | Grad Max: 0.890641 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002138 | Grad Max: 0.005564 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003691 | Grad Max: 0.003691 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001595 | Grad Max: 0.205006 -> Layer: exit2_layers.0.bias | Grad Mean: 0.028564 | Grad Max: 1.123028 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000160 | Grad Max: 0.005607 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015904 | Grad Max: 0.094219 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000019 | Grad Max: 0.000239 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002890 | Grad Max: 0.006802 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000217 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000704 | Grad Max: 0.002979 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000342 | Grad Max: 0.001251 -> Layer: exit2_layers.12.bias | Grad Mean: 0.010904 | Grad Max: 0.010904 [GRADIENT NORM TOTAL] 5.0205 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.046 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004218 0.49957815] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.088 [MASKS] A(Pass/Fail): 691/1357 | B: 673/1375 | C: 662/1386 [LOSS Ex1] A: 0.63067 | B: 0.60600 | C: 0.59760 [LOGITS Ex2 A] Mean Abs: 2.370 | Max: 6.222 [LOSS Ex2] A: 0.09067 | B: 0.29836 | C: 0.21602 ** [JOINT LOSS] ** : 0.813110 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005303 | Grad Max: 0.234053 -> Layer: shared_layers.0.bias | Grad Mean: 0.544093 | Grad Max: 2.706027 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005312 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003653 | Grad Max: 0.003653 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003497 | Grad Max: 0.511352 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065371 | Grad Max: 2.826324 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000366 | Grad Max: 0.013655 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037445 | Grad Max: 0.214354 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000460 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007003 | Grad Max: 0.014054 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000344 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001648 | Grad Max: 0.005145 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000670 | Grad Max: 0.002330 -> Layer: exit2_layers.12.bias | Grad Mean: 0.021231 | Grad Max: 0.021231 [GRADIENT NORM TOTAL] 11.9644 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.937 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417878 0.45821217] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 692/1356 | B: 654/1394 | C: 643/1405 [LOSS Ex1] A: 0.63145 | B: 0.60627 | C: 0.60212 [LOGITS Ex2 A] Mean Abs: 2.358 | Max: 6.586 [LOSS Ex2] A: 0.09708 | B: 0.29379 | C: 0.20501 ** [JOINT LOSS] ** : 0.811908 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004304 | Grad Max: 0.178303 -> Layer: shared_layers.0.bias | Grad Mean: 0.441285 | Grad Max: 2.402687 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002037 | Grad Max: 0.005524 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002884 | Grad Max: 0.002884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002975 | Grad Max: 0.445887 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054982 | Grad Max: 2.461746 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000302 | Grad Max: 0.011856 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031080 | Grad Max: 0.181284 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000396 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005838 | Grad Max: 0.012074 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000276 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001404 | Grad Max: 0.004420 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.002183 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018658 | Grad Max: 0.018658 [GRADIENT NORM TOTAL] 10.0137 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.179 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.82776946 0.17223054] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 750/1298 | B: 669/1379 | C: 645/1403 [LOSS Ex1] A: 0.62476 | B: 0.60174 | C: 0.60144 [LOGITS Ex2 A] Mean Abs: 2.364 | Max: 
9.482 [LOSS Ex2] A: 0.08808 | B: 0.27486 | C: 0.20181 ** [JOINT LOSS] ** : 0.797563 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003558 | Grad Max: 0.154165 -> Layer: shared_layers.0.bias | Grad Mean: 0.346718 | Grad Max: 2.100041 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002188 | Grad Max: 0.005277 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002500 | Grad Max: 0.002500 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002253 | Grad Max: 0.542258 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041421 | Grad Max: 3.020621 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000201 | Grad Max: 0.007901 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020297 | Grad Max: 0.116531 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000026 | Grad Max: 0.000356 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003889 | Grad Max: 0.008700 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000227 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000898 | Grad Max: 0.003631 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000411 | Grad Max: 0.001863 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011899 | Grad Max: 0.011899 [GRADIENT NORM TOTAL] 8.6662 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.311 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007146 0.49928543] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 624/1232 | C: 682/1366 [LOSS Ex1] A: 0.63207 | B: 0.60573 | C: 0.59740 [LOGITS Ex2 A] Mean Abs: 2.362 | Max: 6.954 [LOSS Ex2] A: 0.08569 | B: 0.29429 | C: 0.20382 ** [JOINT LOSS] ** : 0.806336 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004936 | Grad Max: 0.197630 -> Layer: shared_layers.0.bias | Grad Mean: 0.466388 | Grad Max: 2.695253 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002074 | Grad Max: 0.005437 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000578 | Grad 
Max: 0.000578 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003033 | Grad Max: 0.595597 -> Layer: exit2_layers.0.bias | Grad Mean: 0.055163 | Grad Max: 3.350343 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000284 | Grad Max: 0.008224 -> Layer: exit2_layers.3.bias | Grad Mean: 0.029056 | Grad Max: 0.144630 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000436 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005638 | Grad Max: 0.011885 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000334 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001311 | Grad Max: 0.005143 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000593 | Grad Max: 0.002007 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017711 | Grad Max: 0.017711 [GRADIENT NORM TOTAL] 10.7980 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 0.923 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.751702 0.24829794] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.089 [MASKS] A(Pass/Fail): 713/1335 | B: 673/1375 | C: 666/1382 [LOSS Ex1] A: 0.62701 | B: 0.60588 | C: 0.60085 [LOGITS Ex2 A] Mean Abs: 2.347 | Max: 6.592 [LOSS Ex2] A: 0.10130 | B: 0.29487 | C: 0.20836 ** [JOINT LOSS] ** : 0.812756 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003995 | Grad Max: 0.113783 -> Layer: shared_layers.0.bias | Grad Mean: 0.149579 | Grad Max: 0.835968 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.005851 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004469 | Grad Max: 0.004469 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001037 | Grad Max: 0.479124 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017139 | Grad Max: 2.666783 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000052 | Grad Max: 0.002858 -> Layer: exit2_layers.3.bias | Grad Mean: 0.002669 | Grad Max: 0.026688 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000174 
-> Layer: exit2_layers.6.bias | Grad Mean: 0.000395 | Grad Max: 0.002859 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000003 | Grad Max: 0.000096 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000107 | Grad Max: 0.001039 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000323 | Grad Max: 0.001079 -> Layer: exit2_layers.12.bias | Grad Mean: 0.000497 | Grad Max: 0.000497 [GRADIENT NORM TOTAL] 4.6047 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.074 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64667594 0.35332406] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 590/1026 | B: 655/1393 | C: 656/1392 [LOSS Ex1] A: 0.62510 | B: 0.60614 | C: 0.59902 [LOGITS Ex2 A] Mean Abs: 2.417 | Max: 9.562 [LOSS Ex2] A: 0.09288 | B: 0.30240 | C: 0.21965 ** [JOINT LOSS] ** : 0.815066 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004642 | Grad Max: 0.233178 -> Layer: shared_layers.0.bias | Grad Mean: 0.555941 | Grad Max: 3.037862 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002088 | Grad Max: 0.005479 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005678 | Grad Max: 0.005678 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003563 | Grad Max: 0.644318 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066183 | Grad Max: 3.560100 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.011111 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033910 | Grad Max: 0.176565 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000499 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006419 | Grad Max: 0.014353 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000016 | Grad Max: 0.000321 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001497 | Grad Max: 0.005059 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000580 | Grad Max: 0.002015 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018931 | Grad Max: 0.018931 [GRADIENT 
NORM TOTAL] 12.8964 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.312 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083137 0.49168622] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 729/1319 | B: 670/1378 | C: 652/1396 [LOSS Ex1] A: 0.62586 | B: 0.60161 | C: 0.60149 [LOGITS Ex2 A] Mean Abs: 2.389 | Max: 10.382 [LOSS Ex2] A: 0.08314 | B: 0.27545 | C: 0.19947 ** [JOINT LOSS] ** : 0.795673 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004174 | Grad Max: 0.178013 -> Layer: shared_layers.0.bias | Grad Mean: 0.350344 | Grad Max: 2.236244 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002156 | Grad Max: 0.005578 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000825 | Grad Max: 0.000825 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.474007 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041152 | Grad Max: 2.632708 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000172 | Grad Max: 0.005502 -> Layer: exit2_layers.3.bias | Grad Mean: 0.017662 | Grad Max: 0.086431 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000289 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003561 | Grad Max: 0.007923 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000242 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000903 | Grad Max: 0.003356 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000460 | Grad Max: 0.001724 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011630 | Grad Max: 0.011630 [GRADIENT NORM TOTAL] 8.5741 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.230 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5045797 0.49542025] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 721/1327 | B: 625/1231 | C: 613/1435 [LOSS 
Ex1] A: 0.62251 | B: 0.60560 | C: 0.60563 [LOGITS Ex2 A] Mean Abs: 2.330 | Max: 7.217 [LOSS Ex2] A: 0.09691 | B: 0.29461 | C: 0.19642 ** [JOINT LOSS] ** : 0.807227 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004537 | Grad Max: 0.143025 -> Layer: shared_layers.0.bias | Grad Mean: 0.456747 | Grad Max: 2.066792 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002139 | Grad Max: 0.006129 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003349 | Grad Max: 0.003349 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002837 | Grad Max: 0.338709 -> Layer: exit2_layers.0.bias | Grad Mean: 0.052671 | Grad Max: 1.897464 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000308 | Grad Max: 0.010592 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031361 | Grad Max: 0.169184 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000039 | Grad Max: 0.000485 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005685 | Grad Max: 0.012963 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000014 | Grad Max: 0.000313 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001254 | Grad Max: 0.004741 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000539 | Grad Max: 0.002037 -> Layer: exit2_layers.12.bias | Grad Mean: 0.015977 | Grad Max: 0.015977 [GRADIENT NORM TOTAL] 9.5148 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.267 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5113761 0.48862392] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 725/1323 | B: 673/1375 | C: 631/1417 [LOSS Ex1] A: 0.62145 | B: 0.60576 | C: 0.60106 [LOGITS Ex2 A] Mean Abs: 2.332 | Max: 6.951 [LOSS Ex2] A: 0.10004 | B: 0.29175 | C: 0.20735 ** [JOINT LOSS] ** : 0.809137 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003689 | Grad Max: 0.147577 -> Layer: shared_layers.0.bias | Grad Mean: 0.342668 | Grad Max: 1.833389 -> Layer: exit1_layers.0.weight | Grad Mean: 
0.002124 | Grad Max: 0.005769 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008496 | Grad Max: 0.008496 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002229 | Grad Max: 0.252251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.040129 | Grad Max: 1.398212 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000232 | Grad Max: 0.008356 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023496 | Grad Max: 0.123836 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000028 | Grad Max: 0.000380 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004265 | Grad Max: 0.011227 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000235 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001013 | Grad Max: 0.003429 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000469 | Grad Max: 0.001773 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014706 | Grad Max: 0.014706 [GRADIENT NORM TOTAL] 7.2806 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.049 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003877 0.4996123] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.088 [MASKS] A(Pass/Fail): 691/1357 | B: 656/1392 | C: 662/1386 [LOSS Ex1] A: 0.63049 | B: 0.60602 | C: 0.60145 [LOGITS Ex2 A] Mean Abs: 2.359 | Max: 6.432 [LOSS Ex2] A: 0.09392 | B: 0.30380 | C: 0.20647 ** [JOINT LOSS] ** : 0.814046 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007464 | Grad Max: 0.242551 -> Layer: shared_layers.0.bias | Grad Mean: 0.675581 | Grad Max: 3.328815 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002011 | Grad Max: 0.005196 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006622 | Grad Max: 0.006622 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004294 | Grad Max: 0.732520 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079103 | Grad Max: 4.099559 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000387 | Grad Max: 0.013265 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039462 | Grad 
Max: 0.234707 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000603 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007615 | Grad Max: 0.017632 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000360 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001780 | Grad Max: 0.005444 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000771 | Grad Max: 0.002495 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023921 | Grad Max: 0.023921 [GRADIENT NORM TOTAL] 15.2895 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.141 | Max: 0.940 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5417358 0.45826414] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.084 [MASKS] A(Pass/Fail): 692/1356 | B: 670/1378 | C: 661/1387 [LOSS Ex1] A: 0.63128 | B: 0.60149 | C: 0.59692 [LOGITS Ex2 A] Mean Abs: 2.359 | Max: 6.492 [LOSS Ex2] A: 0.10122 | B: 0.28411 | C: 0.20673 ** [JOINT LOSS] ** : 0.807251 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006528 | Grad Max: 0.296593 -> Layer: shared_layers.0.bias | Grad Mean: 0.740089 | Grad Max: 3.788008 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005183 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006553 | Grad Max: 0.006553 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004550 | Grad Max: 0.834221 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084174 | Grad Max: 4.619366 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000444 | Grad Max: 0.015686 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046232 | Grad Max: 0.250357 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000577 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008789 | Grad Max: 0.018217 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000414 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002106 | Grad Max: 0.006701 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000840 | Grad Max: 
0.002555 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027808 | Grad Max: 0.027808 [GRADIENT NORM TOTAL] 16.3541 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.183 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.82852674 0.17147328] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 749/1299 | B: 625/1231 | C: 471/905 [LOSS Ex1] A: 0.62459 | B: 0.60549 | C: 0.59400 [LOGITS Ex2 A] Mean Abs: 2.371 | Max: 7.811 [LOSS Ex2] A: 0.08739 | B: 0.28163 | C: 0.19147 ** [JOINT LOSS] ** : 0.794856 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003570 | Grad Max: 0.093626 -> Layer: shared_layers.0.bias | Grad Mean: 0.197482 | Grad Max: 0.787393 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005780 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002544 | Grad Max: 0.002544 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001468 | Grad Max: 0.418029 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026875 | Grad Max: 2.309893 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.005131 -> Layer: exit2_layers.3.bias | Grad Mean: 0.012285 | Grad Max: 0.079053 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000017 | Grad Max: 0.000255 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002307 | Grad Max: 0.006255 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000174 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000549 | Grad Max: 0.002016 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000349 | Grad Max: 0.001472 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007047 | Grad Max: 0.007047 [GRADIENT NORM TOTAL] 5.4298 [EPOCH SUMMARY] Train Loss: 0.8079 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8018 | Alpha: 0.5500 No improve count: 1/15 ############################## EPOCH 175/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.315 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007629 0.4992371] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 673/1375 | C: 667/1381 [LOSS Ex1] A: 0.63191 | B: 0.60565 | C: 0.59735 [LOGITS Ex2 A] Mean Abs: 2.307 | Max: 7.475 [LOSS Ex2] A: 0.09323 | B: 0.31544 | C: 0.17952 ** [JOINT LOSS] ** : 0.807700 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009838 | Grad Max: 0.304862 -> Layer: shared_layers.0.bias | Grad Mean: 0.853097 | Grad Max: 4.089119 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.005454 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000121 | Grad Max: 0.000121 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005663 | Grad Max: 1.105497 -> Layer: exit2_layers.0.bias | Grad Mean: 0.104019 | Grad Max: 6.114343 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000545 | Grad Max: 0.015991 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055722 | Grad Max: 0.276606 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000811 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010920 | Grad Max: 0.022201 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000553 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002555 | Grad Max: 0.008574 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001114 | Grad Max: 0.003386 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033724 | Grad Max: 0.033724 [GRADIENT NORM TOTAL] 19.4519 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 0.926 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7522561 0.2477439] | Indices: 
[0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.089 [MASKS] A(Pass/Fail): 713/1335 | B: 656/1392 | C: 628/1420 [LOSS Ex1] A: 0.62684 | B: 0.60592 | C: 0.60216 [LOGITS Ex2 A] Mean Abs: 2.279 | Max: 7.330 [LOSS Ex2] A: 0.11484 | B: 0.31985 | C: 0.20920 ** [JOINT LOSS] ** : 0.826265 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013621 | Grad Max: 0.348307 -> Layer: shared_layers.0.bias | Grad Mean: 1.045884 | Grad Max: 4.595587 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002061 | Grad Max: 0.005029 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005893 | Grad Max: 0.005893 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006829 | Grad Max: 0.947800 -> Layer: exit2_layers.0.bias | Grad Mean: 0.126474 | Grad Max: 5.329916 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000698 | Grad Max: 0.019894 -> Layer: exit2_layers.3.bias | Grad Mean: 0.070692 | Grad Max: 0.334431 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000980 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013718 | Grad Max: 0.027364 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000760 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003244 | Grad Max: 0.010822 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001470 | Grad Max: 0.003497 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044110 | Grad Max: 0.044110 [GRADIENT NORM TOTAL] 22.3725 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.077 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6469529 0.3530471] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 590/1026 | B: 672/1376 | C: 654/1394 [LOSS Ex1] A: 0.62493 | B: 0.60139 | C: 0.60263 [LOGITS Ex2 A] Mean Abs: 2.363 | Max: 10.313 [LOSS Ex2] A: 0.09980 | B: 0.28422 | C: 0.20562 ** [JOINT LOSS] ** : 0.806199 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007438 | Grad Max: 0.222790 
-> Layer: shared_layers.0.bias | Grad Mean: 0.456270 | Grad Max: 2.018845 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005484 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002813 | Grad Max: 0.002813 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003185 | Grad Max: 0.734236 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057863 | Grad Max: 4.106436 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000310 | Grad Max: 0.008420 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030703 | Grad Max: 0.146659 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000590 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006191 | Grad Max: 0.014592 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000341 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.005177 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000724 | Grad Max: 0.002389 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020965 | Grad Max: 0.020965 [GRADIENT NORM TOTAL] 10.9506 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.316 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508341 0.49165902] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 728/1320 | B: 625/1231 | C: 653/1395 [LOSS Ex1] A: 0.62570 | B: 0.60538 | C: 0.59614 [LOGITS Ex2 A] Mean Abs: 2.395 | Max: 10.314 [LOSS Ex2] A: 0.08836 | B: 0.28792 | C: 0.19868 ** [JOINT LOSS] ** : 0.800728 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004308 | Grad Max: 0.217466 -> Layer: shared_layers.0.bias | Grad Mean: 0.575676 | Grad Max: 2.885713 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005403 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000993 | Grad Max: 0.000993 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003633 | Grad Max: 0.848790 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067764 | Grad Max: 4.728732 -> Layer: 
exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.012350 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036364 | Grad Max: 0.204932 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000464 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006786 | Grad Max: 0.013748 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000330 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001645 | Grad Max: 0.005120 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000686 | Grad Max: 0.002456 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022045 | Grad Max: 0.022045 [GRADIENT NORM TOTAL] 13.3760 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.233 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50447804 0.49552193] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 721/1327 | B: 673/1375 | C: 708/1340 [LOSS Ex1] A: 0.62235 | B: 0.60555 | C: 0.59349 [LOGITS Ex2 A] Mean Abs: 2.415 | Max: 6.994 [LOSS Ex2] A: 0.09778 | B: 0.32269 | C: 0.21669 ** [JOINT LOSS] ** : 0.819519 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009976 | Grad Max: 0.448461 -> Layer: shared_layers.0.bias | Grad Mean: 1.145258 | Grad Max: 6.090080 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002189 | Grad Max: 0.006600 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001861 | Grad Max: 0.001861 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007069 | Grad Max: 1.485500 -> Layer: exit2_layers.0.bias | Grad Mean: 0.132417 | Grad Max: 8.285478 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000707 | Grad Max: 0.022706 -> Layer: exit2_layers.3.bias | Grad Mean: 0.073498 | Grad Max: 0.389052 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000094 | Grad Max: 0.000892 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013899 | Grad Max: 0.027694 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000698 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.003268 | Grad Max: 0.010230 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.003592 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043008 | Grad Max: 0.043008 [GRADIENT NORM TOTAL] 25.6328 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.271 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51150876 0.48849118] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 725/1323 | B: 658/1390 | C: 647/1401 [LOSS Ex1] A: 0.62129 | B: 0.60582 | C: 0.59904 [LOGITS Ex2 A] Mean Abs: 2.382 | Max: 6.905 [LOSS Ex2] A: 0.10617 | B: 0.31413 | C: 0.20649 ** [JOINT LOSS] ** : 0.817641 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007816 | Grad Max: 0.298697 -> Layer: shared_layers.0.bias | Grad Mean: 0.862895 | Grad Max: 4.091724 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002161 | Grad Max: 0.005778 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002669 | Grad Max: 0.002669 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005638 | Grad Max: 1.173109 -> Layer: exit2_layers.0.bias | Grad Mean: 0.104786 | Grad Max: 6.538060 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000556 | Grad Max: 0.017495 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057382 | Grad Max: 0.278860 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000771 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010912 | Grad Max: 0.023293 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000515 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002558 | Grad Max: 0.007922 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001059 | Grad Max: 0.002937 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033541 | Grad Max: 0.033541 [GRADIENT NORM TOTAL] 19.5200 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.146 | Max: 1.052 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004113 0.49958876] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.088 [MASKS] A(Pass/Fail): 691/1357 | B: 672/1376 | C: 669/1379 [LOSS Ex1] A: 0.63034 | B: 0.60130 | C: 0.60005 [LOGITS Ex2 A] Mean Abs: 2.289 | Max: 5.849 [LOSS Ex2] A: 0.09095 | B: 0.27320 | C: 0.21106 ** [JOINT LOSS] ** : 0.802298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002876 | Grad Max: 0.098796 -> Layer: shared_layers.0.bias | Grad Mean: 0.242382 | Grad Max: 1.345094 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005782 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008534 | Grad Max: 0.008534 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001475 | Grad Max: 0.552085 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026587 | Grad Max: 3.097513 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000127 | Grad Max: 0.004394 -> Layer: exit2_layers.3.bias | Grad Mean: 0.013091 | Grad Max: 0.078090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000016 | Grad Max: 0.000259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002478 | Grad Max: 0.006691 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000171 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000603 | Grad Max: 0.002282 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000358 | Grad Max: 0.001650 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009028 | Grad Max: 0.009028 [GRADIENT NORM TOTAL] 6.0961 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.943 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.541706 0.45829394] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 692/1356 | B: 625/1231 | C: 636/1412 [LOSS Ex1] A: 0.63114 | B: 0.60529 | C: 0.59573 [LOGITS Ex2 A] Mean Abs: 2.267 | Max: 6.210 [LOSS Ex2] A: 0.10063 | B: 0.28369 | C: 0.20835 ** [JOINT LOSS] ** : 
0.808276 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004462 | Grad Max: 0.173319 -> Layer: shared_layers.0.bias | Grad Mean: 0.500568 | Grad Max: 2.356085 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005284 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003399 | Grad Max: 0.003399 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003197 | Grad Max: 0.454253 -> Layer: exit2_layers.0.bias | Grad Mean: 0.059481 | Grad Max: 2.511084 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000342 | Grad Max: 0.012377 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035200 | Grad Max: 0.195682 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000501 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006628 | Grad Max: 0.014343 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000383 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001558 | Grad Max: 0.005838 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000696 | Grad Max: 0.002197 -> Layer: exit2_layers.12.bias | Grad Mean: 0.020642 | Grad Max: 0.020642 [GRADIENT NORM TOTAL] 10.6611 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.186 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8291448 0.17085524] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.089 [MASKS] A(Pass/Fail): 749/1299 | B: 675/1373 | C: 644/1404 [LOSS Ex1] A: 0.62445 | B: 0.60547 | C: 0.60157 [LOGITS Ex2 A] Mean Abs: 2.335 | Max: 7.758 [LOSS Ex2] A: 0.08712 | B: 0.29499 | C: 0.20315 ** [JOINT LOSS] ** : 0.805581 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004433 | Grad Max: 0.133531 -> Layer: shared_layers.0.bias | Grad Mean: 0.160500 | Grad Max: 0.802365 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005926 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000631 | Grad Max: 0.000631 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001338 | 
Grad Max: 0.206997 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023052 | Grad Max: 1.037719 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000090 | Grad Max: 0.005103 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007972 | Grad Max: 0.065151 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000147 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001188 | Grad Max: 0.004258 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000104 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000281 | Grad Max: 0.001374 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000305 | Grad Max: 0.001250 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004360 | Grad Max: 0.004360 [GRADIENT NORM TOTAL] 3.9758 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.318 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5007845 0.49921548] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 663/1385 | C: 657/1391 [LOSS Ex1] A: 0.63177 | B: 0.60572 | C: 0.60088 [LOGITS Ex2 A] Mean Abs: 2.402 | Max: 7.106 [LOSS Ex2] A: 0.08300 | B: 0.31088 | C: 0.20404 ** [JOINT LOSS] ** : 0.812101 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005714 | Grad Max: 0.303205 -> Layer: shared_layers.0.bias | Grad Mean: 0.768485 | Grad Max: 4.102230 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.004893 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002099 | Grad Max: 0.002099 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005069 | Grad Max: 0.783775 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094804 | Grad Max: 4.331377 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.016212 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049215 | Grad Max: 0.265388 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000647 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009283 | Grad Max: 
0.018502 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000484 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002207 | Grad Max: 0.007297 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000873 | Grad Max: 0.002582 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028054 | Grad Max: 0.028054 [GRADIENT NORM TOTAL] 17.8014 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 0.928 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7527278 0.24727216] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.089 [MASKS] A(Pass/Fail): 713/1335 | B: 673/1375 | C: 685/1363 [LOSS Ex1] A: 0.62670 | B: 0.60120 | C: 0.59940 [LOGITS Ex2 A] Mean Abs: 2.372 | Max: 6.716 [LOSS Ex2] A: 0.10771 | B: 0.29012 | C: 0.22309 ** [JOINT LOSS] ** : 0.816071 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004463 | Grad Max: 0.318381 -> Layer: shared_layers.0.bias | Grad Mean: 0.752907 | Grad Max: 4.323667 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005556 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005399 | Grad Max: 0.005399 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004883 | Grad Max: 0.758179 -> Layer: exit2_layers.0.bias | Grad Mean: 0.090849 | Grad Max: 4.211820 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000436 | Grad Max: 0.015832 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045854 | Grad Max: 0.248037 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000571 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008434 | Grad Max: 0.017280 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000457 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001968 | Grad Max: 0.006956 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000791 | Grad Max: 0.002265 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025672 | Grad Max: 0.025672 [GRADIENT NORM TOTAL] 17.7795 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.080 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64723617 0.35276377] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 625/1231 | C: 680/1368 [LOSS Ex1] A: 0.62479 | B: 0.60519 | C: 0.60800 [LOGITS Ex2 A] Mean Abs: 2.380 | Max: 8.272 [LOSS Ex2] A: 0.09819 | B: 0.27895 | C: 0.22537 ** [JOINT LOSS] ** : 0.813497 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004712 | Grad Max: 0.135212 -> Layer: shared_layers.0.bias | Grad Mean: 0.206931 | Grad Max: 1.157732 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002043 | Grad Max: 0.006074 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001905 | Grad Max: 0.001905 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001592 | Grad Max: 0.254734 -> Layer: exit2_layers.0.bias | Grad Mean: 0.027336 | Grad Max: 1.357807 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000107 | Grad Max: 0.004764 -> Layer: exit2_layers.3.bias | Grad Mean: 0.009761 | Grad Max: 0.071782 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000011 | Grad Max: 0.000190 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001647 | Grad Max: 0.005007 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000130 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000386 | Grad Max: 0.001617 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000267 | Grad Max: 0.000986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004100 | Grad Max: 0.004100 [GRADIENT NORM TOTAL] 5.1113 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.319 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083452 0.49165478] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 728/1320 | B: 675/1373 | C: 641/1407 [LOSS Ex1] A: 0.62556 | B: 0.60537 | C: 0.60209 [LOGITS Ex2 A] Mean Abs: 
2.322 | Max: 10.590 [LOSS Ex2] A: 0.08958 | B: 0.31013 | C: 0.20957 ** [JOINT LOSS] ** : 0.814102 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012020 | Grad Max: 0.307126 -> Layer: shared_layers.0.bias | Grad Mean: 0.838868 | Grad Max: 4.057852 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005446 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005408 | Grad Max: 0.005408 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005314 | Grad Max: 0.919225 -> Layer: exit2_layers.0.bias | Grad Mean: 0.097611 | Grad Max: 5.102306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000530 | Grad Max: 0.015335 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053822 | Grad Max: 0.260565 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010674 | Grad Max: 0.021618 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000581 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002579 | Grad Max: 0.008680 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.003475 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035530 | Grad Max: 0.035530 [GRADIENT NORM TOTAL] 17.9125 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.236 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5044997 0.49550033] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 721/1327 | B: 663/1385 | C: 394/982 [LOSS Ex1] A: 0.62222 | B: 0.60562 | C: 0.61132 [LOGITS Ex2 A] Mean Abs: 2.302 | Max: 7.393 [LOSS Ex2] A: 0.09680 | B: 0.31810 | C: 0.23263 ** [JOINT LOSS] ** : 0.828895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009771 | Grad Max: 0.330906 -> Layer: shared_layers.0.bias | Grad Mean: 0.908256 | Grad Max: 4.276681 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.005902 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.002440 | Grad Max: 0.002440 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005765 | Grad Max: 1.013015 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107783 | Grad Max: 5.606791 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000557 | Grad Max: 0.017812 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057844 | Grad Max: 0.307251 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000733 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011147 | Grad Max: 0.022126 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000604 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002695 | Grad Max: 0.008630 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001188 | Grad Max: 0.003070 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037001 | Grad Max: 0.037001 [GRADIENT NORM TOTAL] 20.3421 [EPOCH SUMMARY] Train Loss: 0.8128 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.7921 | Alpha: 0.5500 No improve count: 2/15 ############################## EPOCH 176/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.152 | Max: 1.274 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51155525 0.48844478] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.089 [MASKS] A(Pass/Fail): 725/1323 | B: 673/1375 | C: 646/1402 [LOSS Ex1] A: 0.62115 | B: 0.60110 | C: 0.60301 [LOGITS Ex2 A] Mean Abs: 2.295 | Max: 7.810 [LOSS Ex2] A: 0.10192 | B: 0.27232 | C: 0.19584 ** [JOINT LOSS] ** : 0.798452 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004767 | Grad Max: 0.169517 -> Layer: shared_layers.0.bias | Grad Mean: 0.406693 | Grad Max: 2.326998 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002160 | Grad Max: 0.005832 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003771 | Grad Max: 0.003771 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002252 | Grad Max: 0.734370 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.041024 | Grad Max: 4.045258 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000183 | Grad Max: 0.006965 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018633 | Grad Max: 0.104784 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000024 | Grad Max: 0.000312 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003602 | Grad Max: 0.008417 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000221 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000871 | Grad Max: 0.003153 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000417 | Grad Max: 0.002012 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012088 | Grad Max: 0.012088 [GRADIENT NORM TOTAL] 9.4626 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.055 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50038296 0.49961704] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.564 | Std: 0.088 [MASKS] A(Pass/Fail): 690/1358 | B: 625/1231 | C: 645/1403 [LOSS Ex1] A: 0.63021 | B: 0.60508 | C: 0.59613 [LOGITS Ex2 A] Mean Abs: 2.328 | Max: 5.551 [LOSS Ex2] A: 0.09202 | B: 0.28117 | C: 0.22214 ** [JOINT LOSS] ** : 0.808918 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006454 | Grad Max: 0.189014 -> Layer: shared_layers.0.bias | Grad Mean: 0.556587 | Grad Max: 2.473382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002109 | Grad Max: 0.005265 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004256 | Grad Max: 0.004256 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003601 | Grad Max: 0.550460 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066531 | Grad Max: 3.027926 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000383 | Grad Max: 0.011153 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039469 | Grad Max: 0.204650 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000578 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007654 | Grad Max: 0.016003 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000400 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001843 | Grad Max: 0.005908 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000748 | Grad Max: 0.002410 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024358 | Grad Max: 0.024358 [GRADIENT NORM TOTAL] 11.8718 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.946 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5416846 0.4583154] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 692/1356 | B: 675/1373 | C: 640/1408 [LOSS Ex1] A: 0.63102 | B: 0.60527 | C: 0.59732 [LOGITS Ex2 A] Mean Abs: 2.314 | Max: 6.729 [LOSS Ex2] A: 0.10474 | B: 0.30602 | C: 0.18603 ** [JOINT LOSS] ** : 0.810131 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008293 | Grad Max: 0.235354 -> Layer: shared_layers.0.bias | Grad Mean: 0.698026 | Grad Max: 3.066502 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005526 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008902 | Grad Max: 0.008902 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004475 | Grad Max: 0.654067 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083127 | Grad Max: 3.637976 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000478 | Grad Max: 0.014288 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048956 | Grad Max: 0.244774 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000661 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009489 | Grad Max: 0.019240 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000495 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002281 | Grad Max: 0.007255 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000982 | Grad Max: 0.002913 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030466 | Grad Max: 0.030466 [GRADIENT NORM TOTAL] 14.6849 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.189 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8297274 0.17027256] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 749/1299 | B: 664/1384 | C: 673/1375 [LOSS Ex1] A: 0.62432 | B: 0.60553 | C: 0.59827 [LOGITS Ex2 A] Mean Abs: 2.323 | Max: 8.646 [LOSS Ex2] A: 0.09756 | B: 0.28975 | C: 0.21118 ** [JOINT LOSS] ** : 0.808868 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004091 | Grad Max: 0.135214 -> Layer: shared_layers.0.bias | Grad Mean: 0.143170 | Grad Max: 0.604071 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.005584 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004638 | Grad Max: 0.004638 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001168 | Grad Max: 0.432684 -> Layer: exit2_layers.0.bias | Grad Mean: 0.020419 | Grad Max: 2.347293 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000086 | Grad Max: 0.003359 -> Layer: exit2_layers.3.bias | Grad Mean: 0.007826 | Grad Max: 0.042020 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000013 | Grad Max: 0.000230 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001601 | Grad Max: 0.004884 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000368 | Grad Max: 0.001374 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000333 | Grad Max: 0.001173 -> Layer: exit2_layers.12.bias | Grad Mean: 0.004160 | Grad Max: 0.004160 [GRADIENT NORM TOTAL] 4.3147 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.321 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50077283 0.49922717] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 673/1375 | C: 655/1393 [LOSS Ex1] A: 0.63166 | B: 0.60100 | C: 0.59768 [LOGITS Ex2 A] Mean Abs: 2.281 | Max: 6.477 
[LOSS Ex2] A: 0.09097 | B: 0.28981 | C: 0.20820 ** [JOINT LOSS] ** : 0.806440 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009641 | Grad Max: 0.295676 -> Layer: shared_layers.0.bias | Grad Mean: 0.839323 | Grad Max: 3.761452 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.005133 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003532 | Grad Max: 0.003532 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005253 | Grad Max: 0.668250 -> Layer: exit2_layers.0.bias | Grad Mean: 0.097118 | Grad Max: 3.666086 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.018441 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055272 | Grad Max: 0.298657 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000727 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010780 | Grad Max: 0.021595 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000597 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002610 | Grad Max: 0.009134 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001149 | Grad Max: 0.003227 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035172 | Grad Max: 0.035172 [GRADIENT NORM TOTAL] 17.4550 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.931 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75322014 0.2467799 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.566 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 625/1231 | C: 678/1370 [LOSS Ex1] A: 0.62658 | B: 0.60497 | C: 0.60087 [LOGITS Ex2 A] Mean Abs: 2.258 | Max: 8.097 [LOSS Ex2] A: 0.10902 | B: 0.31417 | C: 0.22213 ** [JOINT LOSS] ** : 0.825911 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010580 | Grad Max: 0.312053 -> Layer: shared_layers.0.bias | Grad Mean: 0.922236 | Grad Max: 4.068727 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005222 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002779 | Grad 
Max: 0.002779 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005780 | Grad Max: 0.726538 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107350 | Grad Max: 4.019336 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000620 | Grad Max: 0.017326 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063954 | Grad Max: 0.309973 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000832 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012350 | Grad Max: 0.024379 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000624 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002960 | Grad Max: 0.009674 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001289 | Grad Max: 0.003184 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039872 | Grad Max: 0.039872 [GRADIENT NORM TOTAL] 19.1998 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.084 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6474591 0.35254088] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 675/1373 | C: 674/1374 [LOSS Ex1] A: 0.62467 | B: 0.60517 | C: 0.59681 [LOGITS Ex2 A] Mean Abs: 2.333 | Max: 11.805 [LOSS Ex2] A: 0.10076 | B: 0.30491 | C: 0.20244 ** [JOINT LOSS] ** : 0.811586 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006327 | Grad Max: 0.168280 -> Layer: shared_layers.0.bias | Grad Mean: 0.344519 | Grad Max: 1.465969 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.005915 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002386 | Grad Max: 0.002386 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002337 | Grad Max: 0.344608 -> Layer: exit2_layers.0.bias | Grad Mean: 0.041627 | Grad Max: 1.882188 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000245 | Grad Max: 0.007869 -> Layer: exit2_layers.3.bias | Grad Mean: 0.024560 | Grad Max: 0.132458 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000033 | Grad Max: 
0.000431 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004974 | Grad Max: 0.010628 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000275 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001195 | Grad Max: 0.004066 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000557 | Grad Max: 0.001981 -> Layer: exit2_layers.12.bias | Grad Mean: 0.016539 | Grad Max: 0.016539 [GRADIENT NORM TOTAL] 7.2624 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.323 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083105 0.4916895] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 728/1320 | B: 664/1384 | C: 649/1399 [LOSS Ex1] A: 0.62544 | B: 0.60542 | C: 0.60062 [LOGITS Ex2 A] Mean Abs: 2.372 | Max: 9.375 [LOSS Ex2] A: 0.09218 | B: 0.32349 | C: 0.23122 ** [JOINT LOSS] ** : 0.826122 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007674 | Grad Max: 0.458716 -> Layer: shared_layers.0.bias | Grad Mean: 1.068816 | Grad Max: 6.224372 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005626 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007668 | Grad Max: 0.007668 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006581 | Grad Max: 1.381277 -> Layer: exit2_layers.0.bias | Grad Mean: 0.123063 | Grad Max: 7.675564 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000591 | Grad Max: 0.018532 -> Layer: exit2_layers.3.bias | Grad Mean: 0.062200 | Grad Max: 0.338443 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000809 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011810 | Grad Max: 0.023549 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000589 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002833 | Grad Max: 0.009431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001139 | Grad Max: 0.003137 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036723 | Grad Max: 0.036723 
[GRADIENT NORM TOTAL] 25.0561 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.240 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50446427 0.4955357 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 721/1327 | B: 673/1375 | C: 668/1380 [LOSS Ex1] A: 0.62209 | B: 0.60090 | C: 0.60719 [LOGITS Ex2 A] Mean Abs: 2.407 | Max: 7.554 [LOSS Ex2] A: 0.10652 | B: 0.33220 | C: 0.25132 ** [JOINT LOSS] ** : 0.840073 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015401 | Grad Max: 0.679527 -> Layer: shared_layers.0.bias | Grad Mean: 1.738411 | Grad Max: 9.155875 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002178 | Grad Max: 0.005962 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006361 | Grad Max: 0.006361 -> Layer: exit2_layers.0.weight | Grad Mean: 0.010840 | Grad Max: 1.969803 -> Layer: exit2_layers.0.bias | Grad Mean: 0.202533 | Grad Max: 10.932016 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001033 | Grad Max: 0.030437 -> Layer: exit2_layers.3.bias | Grad Mean: 0.108418 | Grad Max: 0.533545 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000138 | Grad Max: 0.001288 -> Layer: exit2_layers.6.bias | Grad Mean: 0.020717 | Grad Max: 0.040233 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000053 | Grad Max: 0.000992 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004965 | Grad Max: 0.016357 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002068 | Grad Max: 0.004901 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064879 | Grad Max: 0.064879 [GRADIENT NORM TOTAL] 39.0081 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.278 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5116212 0.48837882] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 725/1323 | B: 625/1231 | C: 
649/1399 [LOSS Ex1] A: 0.62103 | B: 0.60487 | C: 0.60509 [LOGITS Ex2 A] Mean Abs: 2.369 | Max: 7.328 [LOSS Ex2] A: 0.11993 | B: 0.33389 | C: 0.23228 ** [JOINT LOSS] ** : 0.839034 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015557 | Grad Max: 0.606003 -> Layer: shared_layers.0.bias | Grad Mean: 1.567606 | Grad Max: 8.038385 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.005943 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003477 | Grad Max: 0.003477 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009911 | Grad Max: 1.763369 -> Layer: exit2_layers.0.bias | Grad Mean: 0.184381 | Grad Max: 9.800306 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000969 | Grad Max: 0.029891 -> Layer: exit2_layers.3.bias | Grad Mean: 0.101400 | Grad Max: 0.510848 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000131 | Grad Max: 0.001219 -> Layer: exit2_layers.6.bias | Grad Mean: 0.019654 | Grad Max: 0.038537 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000051 | Grad Max: 0.000888 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004764 | Grad Max: 0.014826 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002063 | Grad Max: 0.004841 -> Layer: exit2_layers.12.bias | Grad Mean: 0.063717 | Grad Max: 0.063717 [GRADIENT NORM TOTAL] 34.8360 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.146 | Max: 1.057 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003391 0.4996609] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.088 [MASKS] A(Pass/Fail): 690/1358 | B: 675/1373 | C: 682/1366 [LOSS Ex1] A: 0.63009 | B: 0.60508 | C: 0.59551 [LOGITS Ex2 A] Mean Abs: 2.302 | Max: 6.050 [LOSS Ex2] A: 0.09132 | B: 0.30923 | C: 0.20967 ** [JOINT LOSS] ** : 0.813633 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008985 | Grad Max: 0.320433 -> Layer: shared_layers.0.bias | Grad Mean: 0.742529 | Grad Max: 3.755771 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002092 | Grad Max: 0.005790 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001985 | Grad Max: 0.001985 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004664 | Grad Max: 0.733430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086578 | Grad Max: 4.065271 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000466 | Grad Max: 0.016186 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048268 | Grad Max: 0.259386 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000607 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009413 | Grad Max: 0.018363 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000399 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002313 | Grad Max: 0.006905 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000944 | Grad Max: 0.002976 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030664 | Grad Max: 0.030664 [GRADIENT NORM TOTAL] 16.2391 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.948 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54168504 0.45831496] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 693/1355 | B: 664/1384 | C: 657/1391 [LOSS Ex1] A: 0.63092 | B: 0.60534 | C: 0.59929 [LOGITS Ex2 A] Mean Abs: 2.234 | Max: 6.458 [LOSS Ex2] A: 0.09628 | B: 0.30228 | C: 0.20343 ** [JOINT LOSS] ** : 0.812509 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007075 | Grad Max: 0.237467 -> Layer: shared_layers.0.bias | Grad Mean: 0.620302 | Grad Max: 3.225569 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005532 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009951 | Grad Max: 0.009951 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003888 | Grad Max: 0.925578 -> Layer: exit2_layers.0.bias | Grad Mean: 0.072216 | Grad Max: 5.112112 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000389 | Grad Max: 0.013230 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.039991 | Grad Max: 0.218175 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000545 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007818 | Grad Max: 0.015970 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000454 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001906 | Grad Max: 0.006503 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000899 | Grad Max: 0.002905 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026795 | Grad Max: 0.026795 [GRADIENT NORM TOTAL] 14.0493 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.191 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8302773 0.1697227] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 749/1299 | B: 673/1375 | C: 616/1432 [LOSS Ex1] A: 0.62421 | B: 0.60082 | C: 0.60812 [LOGITS Ex2 A] Mean Abs: 2.236 | Max: 8.160 [LOSS Ex2] A: 0.09090 | B: 0.31453 | C: 0.23167 ** [JOINT LOSS] ** : 0.823417 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008648 | Grad Max: 0.374222 -> Layer: shared_layers.0.bias | Grad Mean: 1.014057 | Grad Max: 4.903749 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | Grad Max: 0.005299 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001027 | Grad Max: 0.001027 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006684 | Grad Max: 1.138995 -> Layer: exit2_layers.0.bias | Grad Mean: 0.125916 | Grad Max: 6.286173 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000681 | Grad Max: 0.022003 -> Layer: exit2_layers.3.bias | Grad Mean: 0.071449 | Grad Max: 0.355191 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000856 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013628 | Grad Max: 0.026281 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000649 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003303 | Grad Max: 0.010533 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001396 | 
Grad Max: 0.003381 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044123 | Grad Max: 0.044123 [GRADIENT NORM TOTAL] 22.9404 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.150 | Max: 1.325 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50077987 0.49922007] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 625/1231 | C: 469/907 [LOSS Ex1] A: 0.63155 | B: 0.60480 | C: 0.59451 [LOGITS Ex2 A] Mean Abs: 2.260 | Max: 6.926 [LOSS Ex2] A: 0.08420 | B: 0.31117 | C: 0.20496 ** [JOINT LOSS] ** : 0.810398 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007051 | Grad Max: 0.302377 -> Layer: shared_layers.0.bias | Grad Mean: 0.825366 | Grad Max: 4.064525 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.006189 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001021 | Grad Max: 0.001021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005577 | Grad Max: 0.947340 -> Layer: exit2_layers.0.bias | Grad Mean: 0.103348 | Grad Max: 5.248195 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.017476 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057967 | Grad Max: 0.289239 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000730 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011255 | Grad Max: 0.023302 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000598 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002706 | Grad Max: 0.009348 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001183 | Grad Max: 0.003291 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036619 | Grad Max: 0.036619 [GRADIENT NORM TOTAL] 18.9271 [EPOCH SUMMARY] Train Loss: 0.8168 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.7917 | Alpha: 0.5500 No improve count: 3/15 ############################## EPOCH 177/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.933 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7535705 0.24642953] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 675/1373 | C: 646/1402 [LOSS Ex1] A: 0.62646 | B: 0.60501 | C: 0.60406 [LOGITS Ex2 A] Mean Abs: 2.271 | Max: 7.725 [LOSS Ex2] A: 0.10119 | B: 0.30364 | C: 0.20967 ** [JOINT LOSS] ** : 0.816677 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002601 | Grad Max: 0.064463 -> Layer: shared_layers.0.bias | Grad Mean: 0.176980 | Grad Max: 0.805885 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005192 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002578 | Grad Max: 0.002578 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001324 | Grad Max: 0.198173 -> Layer: exit2_layers.0.bias | Grad Mean: 0.023688 | Grad Max: 1.089600 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000114 | Grad Max: 0.004608 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011457 | Grad Max: 0.074510 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000014 | Grad Max: 0.000237 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002063 | Grad Max: 0.005910 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000006 | Grad Max: 0.000139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000487 | Grad Max: 0.002017 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000306 | Grad Max: 0.001384 -> Layer: exit2_layers.12.bias | Grad Mean: 0.006703 | Grad Max: 0.006703 [GRADIENT NORM TOTAL] 4.2812 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.086 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64751923 0.3524808 ] | 
Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 664/1384 | C: 648/1400 [LOSS Ex1] A: 0.62455 | B: 0.60526 | C: 0.60337 [LOGITS Ex2 A] Mean Abs: 2.390 | Max: 12.313 [LOSS Ex2] A: 0.10441 | B: 0.33511 | C: 0.20290 ** [JOINT LOSS] ** : 0.825201 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012743 | Grad Max: 0.503062 -> Layer: shared_layers.0.bias | Grad Mean: 1.257195 | Grad Max: 6.637047 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002084 | Grad Max: 0.005536 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007910 | Grad Max: 0.007910 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008072 | Grad Max: 1.273854 -> Layer: exit2_layers.0.bias | Grad Mean: 0.150060 | Grad Max: 7.126526 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000788 | Grad Max: 0.024811 -> Layer: exit2_layers.3.bias | Grad Mean: 0.082576 | Grad Max: 0.413432 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000107 | Grad Max: 0.001029 -> Layer: exit2_layers.6.bias | Grad Mean: 0.016124 | Grad Max: 0.031277 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000802 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003962 | Grad Max: 0.012663 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001647 | Grad Max: 0.004041 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052696 | Grad Max: 0.052696 [GRADIENT NORM TOTAL] 27.9295 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.326 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50835264 0.49164736] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 728/1320 | B: 673/1375 | C: 672/1376 [LOSS Ex1] A: 0.62532 | B: 0.60075 | C: 0.59849 [LOGITS Ex2 A] Mean Abs: 2.358 | Max: 9.378 [LOSS Ex2] A: 0.10273 | B: 0.35893 | C: 0.26500 ** [JOINT LOSS] ** : 0.850408 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.017955 | Grad 
Max: 0.706707 -> Layer: shared_layers.0.bias | Grad Mean: 1.797118 | Grad Max: 9.609874 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.005373 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000206 | Grad Max: 0.000206 -> Layer: exit2_layers.0.weight | Grad Mean: 0.011493 | Grad Max: 1.675225 -> Layer: exit2_layers.0.bias | Grad Mean: 0.213592 | Grad Max: 9.390087 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001136 | Grad Max: 0.034902 -> Layer: exit2_layers.3.bias | Grad Mean: 0.118895 | Grad Max: 0.602330 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000154 | Grad Max: 0.001463 -> Layer: exit2_layers.6.bias | Grad Mean: 0.023062 | Grad Max: 0.045407 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000060 | Grad Max: 0.001151 -> Layer: exit2_layers.9.bias | Grad Mean: 0.005587 | Grad Max: 0.018666 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002347 | Grad Max: 0.005865 -> Layer: exit2_layers.12.bias | Grad Mean: 0.072860 | Grad Max: 0.072860 [GRADIENT NORM TOTAL] 39.7501 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.242 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50437355 0.49562642] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 721/1327 | B: 625/1231 | C: 650/1398 [LOSS Ex1] A: 0.62198 | B: 0.60473 | C: 0.60203 [LOGITS Ex2 A] Mean Abs: 2.364 | Max: 6.491 [LOSS Ex2] A: 0.10376 | B: 0.34067 | C: 0.25229 ** [JOINT LOSS] ** : 0.841820 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014724 | Grad Max: 0.536502 -> Layer: shared_layers.0.bias | Grad Mean: 1.455223 | Grad Max: 7.018998 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002288 | Grad Max: 0.006578 -> Layer: exit1_layers.0.bias | Grad Mean: 0.013934 | Grad Max: 0.013934 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009392 | Grad Max: 1.385051 -> Layer: exit2_layers.0.bias | Grad Mean: 0.174771 | Grad Max: 7.653933 
-> Layer: exit2_layers.3.weight | Grad Mean: 0.000974 | Grad Max: 0.030309 -> Layer: exit2_layers.3.bias | Grad Mean: 0.102527 | Grad Max: 0.530467 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000132 | Grad Max: 0.001242 -> Layer: exit2_layers.6.bias | Grad Mean: 0.019920 | Grad Max: 0.038326 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001073 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004933 | Grad Max: 0.016441 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002082 | Grad Max: 0.005174 -> Layer: exit2_layers.12.bias | Grad Mean: 0.066274 | Grad Max: 0.066274 [GRADIENT NORM TOTAL] 31.6905 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.280 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51172817 0.4882719 ] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 725/1323 | B: 675/1373 | C: 630/1418 [LOSS Ex1] A: 0.62092 | B: 0.60495 | C: 0.60300 [LOGITS Ex2 A] Mean Abs: 2.285 | Max: 6.376 [LOSS Ex2] A: 0.10337 | B: 0.30643 | C: 0.20875 ** [JOINT LOSS] ** : 0.815806 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007675 | Grad Max: 0.216533 -> Layer: shared_layers.0.bias | Grad Mean: 0.655335 | Grad Max: 2.921336 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002133 | Grad Max: 0.006151 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000210 | Grad Max: 0.000210 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004349 | Grad Max: 0.566335 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080491 | Grad Max: 3.160355 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.015867 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048683 | Grad Max: 0.247603 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000629 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009551 | Grad Max: 0.018669 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000484 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002346 | Grad Max: 0.007862 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000980 | Grad Max: 0.003036 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031040 | Grad Max: 0.031040 [GRADIENT NORM TOTAL] 13.8384 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.060 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50036466 0.49963534] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.088 [MASKS] A(Pass/Fail): 690/1358 | B: 664/1384 | C: 676/1372 [LOSS Ex1] A: 0.62998 | B: 0.60520 | C: 0.59504 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.286 [LOSS Ex2] A: 0.09770 | B: 0.31739 | C: 0.21319 ** [JOINT LOSS] ** : 0.819501 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008084 | Grad Max: 0.291463 -> Layer: shared_layers.0.bias | Grad Mean: 0.802231 | Grad Max: 3.723869 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005389 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003741 | Grad Max: 0.003741 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004892 | Grad Max: 0.906629 -> Layer: exit2_layers.0.bias | Grad Mean: 0.090887 | Grad Max: 5.089719 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000497 | Grad Max: 0.016187 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052098 | Grad Max: 0.266044 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000661 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010111 | Grad Max: 0.020397 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000510 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002470 | Grad Max: 0.008187 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001052 | Grad Max: 0.003059 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032957 | Grad Max: 0.032957 [GRADIENT NORM TOTAL] 17.7342 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.142 | Max: 0.949 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.541623 0.45837697] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 693/1355 | B: 673/1375 | C: 673/1375 [LOSS Ex1] A: 0.63081 | B: 0.60069 | C: 0.59743 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 6.179 [LOSS Ex2] A: 0.12230 | B: 0.33913 | C: 0.21808 ** [JOINT LOSS] ** : 0.836146 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.016352 | Grad Max: 0.439893 -> Layer: shared_layers.0.bias | Grad Mean: 1.348686 | Grad Max: 5.885354 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002090 | Grad Max: 0.005254 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007072 | Grad Max: 0.007072 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008465 | Grad Max: 1.337051 -> Layer: exit2_layers.0.bias | Grad Mean: 0.157538 | Grad Max: 7.400558 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000892 | Grad Max: 0.026617 -> Layer: exit2_layers.3.bias | Grad Mean: 0.092705 | Grad Max: 0.470533 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000122 | Grad Max: 0.001193 -> Layer: exit2_layers.6.bias | Grad Mean: 0.018107 | Grad Max: 0.035043 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000048 | Grad Max: 0.000937 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004412 | Grad Max: 0.014248 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001891 | Grad Max: 0.004322 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058377 | Grad Max: 0.058377 [GRADIENT NORM TOTAL] 28.8972 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.194 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8306088 0.16939126] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 747/1301 | B: 625/1231 | C: 674/1374 [LOSS Ex1] A: 0.62411 | B: 0.60466 | C: 0.59541 [LOGITS Ex2 A] Mean Abs: 2.160 | Max: 7.427 [LOSS Ex2] A: 0.09818 | B: 0.34529 | C: 0.22850 ** [JOINT LOSS] ** : 
0.832046 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014555 | Grad Max: 0.467094 -> Layer: shared_layers.0.bias | Grad Mean: 1.286538 | Grad Max: 6.225985 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002102 | Grad Max: 0.006022 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003153 | Grad Max: 0.003153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007957 | Grad Max: 1.152430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.148720 | Grad Max: 6.380592 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000872 | Grad Max: 0.027025 -> Layer: exit2_layers.3.bias | Grad Mean: 0.091388 | Grad Max: 0.475451 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001127 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017816 | Grad Max: 0.034294 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000917 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004398 | Grad Max: 0.014211 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001872 | Grad Max: 0.004238 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058969 | Grad Max: 0.058969 [GRADIENT NORM TOTAL] 27.1080 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.327 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50079536 0.49920467] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 675/1373 | C: 682/1366 [LOSS Ex1] A: 0.63145 | B: 0.60488 | C: 0.59898 [LOGITS Ex2 A] Mean Abs: 2.221 | Max: 6.506 [LOSS Ex2] A: 0.09114 | B: 0.32030 | C: 0.19951 ** [JOINT LOSS] ** : 0.815419 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009127 | Grad Max: 0.268276 -> Layer: shared_layers.0.bias | Grad Mean: 0.717261 | Grad Max: 3.552707 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005213 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001095 | Grad Max: 0.001095 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004522 | 
Grad Max: 0.792040 -> Layer: exit2_layers.0.bias | Grad Mean: 0.082774 | Grad Max: 4.456845 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000471 | Grad Max: 0.013817 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048958 | Grad Max: 0.243816 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000679 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009757 | Grad Max: 0.020002 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000498 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002403 | Grad Max: 0.007997 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001063 | Grad Max: 0.003052 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032723 | Grad Max: 0.032723 [GRADIENT NORM TOTAL] 15.7850 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.934 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7537448 0.2462552] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 664/1384 | C: 686/1362 [LOSS Ex1] A: 0.62637 | B: 0.60514 | C: 0.60089 [LOGITS Ex2 A] Mean Abs: 2.244 | Max: 7.691 [LOSS Ex2] A: 0.10820 | B: 0.29639 | C: 0.20988 ** [JOINT LOSS] ** : 0.815625 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005146 | Grad Max: 0.176535 -> Layer: shared_layers.0.bias | Grad Mean: 0.594779 | Grad Max: 2.386675 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002040 | Grad Max: 0.005605 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003957 | Grad Max: 0.003957 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004035 | Grad Max: 0.687226 -> Layer: exit2_layers.0.bias | Grad Mean: 0.075033 | Grad Max: 3.828740 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000392 | Grad Max: 0.012511 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041745 | Grad Max: 0.212567 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 0.000508 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008134 | Grad Max: 
0.015973 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000427 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002015 | Grad Max: 0.006621 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000797 | Grad Max: 0.002772 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026304 | Grad Max: 0.026304 [GRADIENT NORM TOTAL] 13.7124 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.088 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.647562 0.35243797] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 673/1375 | C: 681/1367 [LOSS Ex1] A: 0.62447 | B: 0.60063 | C: 0.59774 [LOGITS Ex2 A] Mean Abs: 2.338 | Max: 8.876 [LOSS Ex2] A: 0.09456 | B: 0.31315 | C: 0.21942 ** [JOINT LOSS] ** : 0.816653 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009883 | Grad Max: 0.392674 -> Layer: shared_layers.0.bias | Grad Mean: 1.069519 | Grad Max: 5.330593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002239 | Grad Max: 0.006339 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011851 | Grad Max: 0.011851 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006987 | Grad Max: 1.035984 -> Layer: exit2_layers.0.bias | Grad Mean: 0.130419 | Grad Max: 5.736405 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000676 | Grad Max: 0.021846 -> Layer: exit2_layers.3.bias | Grad Mean: 0.071771 | Grad Max: 0.372744 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.000867 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013943 | Grad Max: 0.027202 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000718 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003435 | Grad Max: 0.011170 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001384 | Grad Max: 0.003643 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044827 | Grad Max: 0.044827 [GRADIENT NORM TOTAL] 24.1507 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.328 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50833124 0.49166873] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 728/1320 | B: 625/1231 | C: 664/1384 [LOSS Ex1] A: 0.62524 | B: 0.60460 | C: 0.60058 [LOGITS Ex2 A] Mean Abs: 2.304 | Max: 7.834 [LOSS Ex2] A: 0.09412 | B: 0.29890 | C: 0.21822 ** [JOINT LOSS] ** : 0.813886 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009598 | Grad Max: 0.315105 -> Layer: shared_layers.0.bias | Grad Mean: 0.906907 | Grad Max: 4.300270 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005409 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000351 | Grad Max: 0.000351 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005974 | Grad Max: 0.910318 -> Layer: exit2_layers.0.bias | Grad Mean: 0.110863 | Grad Max: 5.112164 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000586 | Grad Max: 0.018105 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061726 | Grad Max: 0.316824 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000760 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012165 | Grad Max: 0.023673 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000603 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003045 | Grad Max: 0.009350 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001244 | Grad Max: 0.003536 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040411 | Grad Max: 0.040411 [GRADIENT NORM TOTAL] 20.4554 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.244 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043426 0.49565738] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 721/1327 | B: 675/1373 | C: 655/1393 [LOSS Ex1] A: 0.62191 | B: 0.60483 | C: 0.59904 [LOGITS Ex2 A] Mean 
Abs: 2.265 | Max: 6.588 [LOSS Ex2] A: 0.10000 | B: 0.30126 | C: 0.20064 ** [JOINT LOSS] ** : 0.809223 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004032 | Grad Max: 0.117286 -> Layer: shared_layers.0.bias | Grad Mean: 0.216186 | Grad Max: 1.168710 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002167 | Grad Max: 0.005912 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000665 | Grad Max: 0.000665 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001446 | Grad Max: 0.337230 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026813 | Grad Max: 1.869007 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000152 | Grad Max: 0.005409 -> Layer: exit2_layers.3.bias | Grad Mean: 0.015405 | Grad Max: 0.081479 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000022 | Grad Max: 0.000270 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003002 | Grad Max: 0.006734 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000009 | Grad Max: 0.000173 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000734 | Grad Max: 0.002558 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000389 | Grad Max: 0.001702 -> Layer: exit2_layers.12.bias | Grad Mean: 0.009713 | Grad Max: 0.009713 [GRADIENT NORM TOTAL] 5.0620 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.282 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5117374 0.4882626] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 725/1323 | B: 664/1384 | C: 452/924 [LOSS Ex1] A: 0.62085 | B: 0.60508 | C: 0.60435 [LOGITS Ex2 A] Mean Abs: 2.127 | Max: 6.700 [LOSS Ex2] A: 0.10709 | B: 0.34018 | C: 0.23034 ** [JOINT LOSS] ** : 0.835964 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010595 | Grad Max: 0.382774 -> Layer: shared_layers.0.bias | Grad Mean: 1.031270 | Grad Max: 4.879426 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.005807 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.010050 | Grad Max: 0.010050 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006544 | Grad Max: 0.785962 -> Layer: exit2_layers.0.bias | Grad Mean: 0.122486 | Grad Max: 4.322562 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000660 | Grad Max: 0.019084 -> Layer: exit2_layers.3.bias | Grad Mean: 0.070025 | Grad Max: 0.343889 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000862 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013667 | Grad Max: 0.026247 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000656 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003428 | Grad Max: 0.010455 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001500 | Grad Max: 0.003512 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047080 | Grad Max: 0.047080 [GRADIENT NORM TOTAL] 22.1416 [EPOCH SUMMARY] Train Loss: 0.8246 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8318 | Alpha: 0.5500 No improve count: 4/15 ############################## EPOCH 178/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.061 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50030965 0.49969038] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.088 [MASKS] A(Pass/Fail): 690/1358 | B: 673/1375 | C: 686/1362 [LOSS Ex1] A: 0.62990 | B: 0.60057 | C: 0.59879 [LOGITS Ex2 A] Mean Abs: 2.082 | Max: 5.516 [LOSS Ex2] A: 0.11719 | B: 0.35197 | C: 0.23409 ** [JOINT LOSS] ** : 0.844172 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015495 | Grad Max: 0.481567 -> Layer: shared_layers.0.bias | Grad Mean: 1.396227 | Grad Max: 6.374991 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.006215 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011007 | Grad Max: 0.011007 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009258 | Grad Max: 1.310631 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.172538 | Grad Max: 7.253920 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000930 | Grad Max: 0.026989 -> Layer: exit2_layers.3.bias | Grad Mean: 0.097892 | Grad Max: 0.485254 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000128 | Grad Max: 0.001212 -> Layer: exit2_layers.6.bias | Grad Mean: 0.019253 | Grad Max: 0.037220 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.000902 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004817 | Grad Max: 0.014913 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002034 | Grad Max: 0.004765 -> Layer: exit2_layers.12.bias | Grad Mean: 0.064382 | Grad Max: 0.064382 [GRADIENT NORM TOTAL] 30.8616 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.951 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5415582 0.4584418] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 693/1355 | B: 625/1231 | C: 658/1390 [LOSS Ex1] A: 0.63074 | B: 0.60454 | C: 0.59795 [LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.303 [LOSS Ex2] A: 0.11533 | B: 0.34524 | C: 0.21621 ** [JOINT LOSS] ** : 0.836670 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014633 | Grad Max: 0.413523 -> Layer: shared_layers.0.bias | Grad Mean: 1.235357 | Grad Max: 5.719989 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.005455 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008703 | Grad Max: 0.008703 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008129 | Grad Max: 1.166060 -> Layer: exit2_layers.0.bias | Grad Mean: 0.151406 | Grad Max: 6.436661 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000830 | Grad Max: 0.024344 -> Layer: exit2_layers.3.bias | Grad Mean: 0.087128 | Grad Max: 0.436899 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000115 | Grad Max: 0.001100 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017232 | Grad Max: 0.033187 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000908 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004303 | Grad Max: 0.014079 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001828 | Grad Max: 0.004378 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057246 | Grad Max: 0.057246 [GRADIENT NORM TOTAL] 26.9739 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.196 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8309583 0.16904166] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 747/1301 | B: 676/1372 | C: 645/1403 [LOSS Ex1] A: 0.62403 | B: 0.60477 | C: 0.60211 [LOGITS Ex2 A] Mean Abs: 2.165 | Max: 6.873 [LOSS Ex2] A: 0.09053 | B: 0.32233 | C: 0.20309 ** [JOINT LOSS] ** : 0.815617 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003730 | Grad Max: 0.201734 -> Layer: shared_layers.0.bias | Grad Mean: 0.485077 | Grad Max: 2.755146 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002119 | Grad Max: 0.005916 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004359 | Grad Max: 0.004359 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003082 | Grad Max: 0.492783 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057136 | Grad Max: 2.723402 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000287 | Grad Max: 0.009168 -> Layer: exit2_layers.3.bias | Grad Mean: 0.030954 | Grad Max: 0.157951 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000037 | Grad Max: 0.000407 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005901 | Grad Max: 0.012561 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000015 | Grad Max: 0.000296 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001456 | Grad Max: 0.004758 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000590 | Grad Max: 0.002096 -> Layer: exit2_layers.12.bias | Grad Mean: 0.019392 | Grad Max: 0.019392 [GRADIENT NORM TOTAL] 11.1007 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.330 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008096 0.49919042] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.088 [MASKS] A(Pass/Fail): 726/1322 | B: 664/1384 | C: 679/1369 [LOSS Ex1] A: 0.63137 | B: 0.60502 | C: 0.59840 [LOGITS Ex2 A] Mean Abs: 2.265 | Max: 6.798 [LOSS Ex2] A: 0.08806 | B: 0.31233 | C: 0.20962 ** [JOINT LOSS] ** : 0.814933 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008138 | Grad Max: 0.351002 -> Layer: shared_layers.0.bias | Grad Mean: 0.926143 | Grad Max: 4.856454 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005448 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000216 | Grad Max: 0.000216 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005819 | Grad Max: 0.989143 -> Layer: exit2_layers.0.bias | Grad Mean: 0.108718 | Grad Max: 5.546182 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000575 | Grad Max: 0.017449 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061298 | Grad Max: 0.321369 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000737 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011967 | Grad Max: 0.023447 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000579 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002977 | Grad Max: 0.009540 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001165 | Grad Max: 0.003558 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038263 | Grad Max: 0.038263 [GRADIENT NORM TOTAL] 20.6755 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.936 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75404525 0.24595474] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 673/1375 | C: 665/1383 [LOSS Ex1] A: 0.62628 | B: 0.60051 | C: 0.59962 [LOGITS Ex2 A] Mean Abs: 2.251 | Max: 
6.400 [LOSS Ex2] A: 0.11754 | B: 0.33876 | C: 0.24214 ** [JOINT LOSS] ** : 0.841614 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012690 | Grad Max: 0.573189 -> Layer: shared_layers.0.bias | Grad Mean: 1.509564 | Grad Max: 7.740436 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.005088 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000396 | Grad Max: 0.000396 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009585 | Grad Max: 1.493466 -> Layer: exit2_layers.0.bias | Grad Mean: 0.178645 | Grad Max: 8.291449 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000922 | Grad Max: 0.027532 -> Layer: exit2_layers.3.bias | Grad Mean: 0.098510 | Grad Max: 0.492495 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000125 | Grad Max: 0.001177 -> Layer: exit2_layers.6.bias | Grad Mean: 0.019276 | Grad Max: 0.036812 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000050 | Grad Max: 0.000982 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004804 | Grad Max: 0.015995 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001953 | Grad Max: 0.004572 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062996 | Grad Max: 0.062996 [GRADIENT NORM TOTAL] 33.7659 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.090 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6476921 0.35230792] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 625/1231 | C: 678/1370 [LOSS Ex1] A: 0.62438 | B: 0.60448 | C: 0.60086 [LOGITS Ex2 A] Mean Abs: 2.309 | Max: 7.232 [LOSS Ex2] A: 0.10220 | B: 0.32787 | C: 0.21470 ** [JOINT LOSS] ** : 0.824830 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011408 | Grad Max: 0.483699 -> Layer: shared_layers.0.bias | Grad Mean: 1.256945 | Grad Max: 6.387123 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005658 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003209 | 
Grad Max: 0.003209 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007972 | Grad Max: 1.196292 -> Layer: exit2_layers.0.bias | Grad Mean: 0.148294 | Grad Max: 6.610897 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000774 | Grad Max: 0.023026 -> Layer: exit2_layers.3.bias | Grad Mean: 0.082366 | Grad Max: 0.411196 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000106 | Grad Max: 0.001023 -> Layer: exit2_layers.6.bias | Grad Mean: 0.016277 | Grad Max: 0.031388 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000798 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004153 | Grad Max: 0.012893 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001755 | Grad Max: 0.004518 -> Layer: exit2_layers.12.bias | Grad Mean: 0.056076 | Grad Max: 0.056076 [GRADIENT NORM TOTAL] 27.6184 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.330 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083298 0.4916702] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 728/1320 | B: 676/1372 | C: 675/1373 [LOSS Ex1] A: 0.62515 | B: 0.60471 | C: 0.59915 [LOGITS Ex2 A] Mean Abs: 2.244 | Max: 7.936 [LOSS Ex2] A: 0.09098 | B: 0.31119 | C: 0.21058 ** [JOINT LOSS] ** : 0.813918 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005876 | Grad Max: 0.218066 -> Layer: shared_layers.0.bias | Grad Mean: 0.610699 | Grad Max: 3.005422 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002079 | Grad Max: 0.005478 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003927 | Grad Max: 0.003927 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003934 | Grad Max: 0.588001 -> Layer: exit2_layers.0.bias | Grad Mean: 0.073182 | Grad Max: 3.261060 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.012607 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040831 | Grad Max: 0.208376 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000053 | Grad Max: 
0.000531 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008020 | Grad Max: 0.016290 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000402 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002005 | Grad Max: 0.006243 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000780 | Grad Max: 0.002772 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025785 | Grad Max: 0.025785 [GRADIENT NORM TOTAL] 13.6125 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.246 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5043194 0.4956806] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 721/1327 | B: 664/1384 | C: 671/1377 [LOSS Ex1] A: 0.62181 | B: 0.60496 | C: 0.59831 [LOGITS Ex2 A] Mean Abs: 2.169 | Max: 6.745 [LOSS Ex2] A: 0.09990 | B: 0.30342 | C: 0.21889 ** [JOINT LOSS] ** : 0.815762 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006835 | Grad Max: 0.227219 -> Layer: shared_layers.0.bias | Grad Mean: 0.585403 | Grad Max: 3.097494 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.005570 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000833 | Grad Max: 0.000833 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003540 | Grad Max: 0.856583 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065935 | Grad Max: 4.733838 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000334 | Grad Max: 0.011307 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035126 | Grad Max: 0.186538 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000509 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006965 | Grad Max: 0.014325 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000363 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001777 | Grad Max: 0.006067 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000786 | Grad Max: 0.002656 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024349 | Grad Max: 0.024349 
[GRADIENT NORM TOTAL] 13.5486 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.284 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51178336 0.48821664] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 725/1323 | B: 673/1375 | C: 667/1381 [LOSS Ex1] A: 0.62076 | B: 0.60046 | C: 0.60086 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 7.492 [LOSS Ex2] A: 0.10934 | B: 0.31171 | C: 0.22163 ** [JOINT LOSS] ** : 0.821585 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010253 | Grad Max: 0.350202 -> Layer: shared_layers.0.bias | Grad Mean: 0.991926 | Grad Max: 4.647109 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002169 | Grad Max: 0.005717 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004810 | Grad Max: 0.004810 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006262 | Grad Max: 1.051634 -> Layer: exit2_layers.0.bias | Grad Mean: 0.117396 | Grad Max: 5.802754 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000638 | Grad Max: 0.020951 -> Layer: exit2_layers.3.bias | Grad Mean: 0.067643 | Grad Max: 0.356323 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000856 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013279 | Grad Max: 0.026948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000685 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003385 | Grad Max: 0.010964 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001406 | Grad Max: 0.003719 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045002 | Grad Max: 0.045002 [GRADIENT NORM TOTAL] 21.7245 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.063 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50032 0.49967998] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.088 [MASKS] A(Pass/Fail): 690/1358 | B: 625/1231 | C: 
684/1364 [LOSS Ex1] A: 0.62981 | B: 0.60443 | C: 0.60090 [LOGITS Ex2 A] Mean Abs: 2.091 | Max: 6.880 [LOSS Ex2] A: 0.10184 | B: 0.31230 | C: 0.21331 ** [JOINT LOSS] ** : 0.820864 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009675 | Grad Max: 0.310430 -> Layer: shared_layers.0.bias | Grad Mean: 0.917102 | Grad Max: 4.105142 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002022 | Grad Max: 0.005471 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002392 | Grad Max: 0.002392 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005876 | Grad Max: 1.015756 -> Layer: exit2_layers.0.bias | Grad Mean: 0.109613 | Grad Max: 5.629456 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000599 | Grad Max: 0.018799 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063172 | Grad Max: 0.327054 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000870 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012403 | Grad Max: 0.025008 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000663 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003112 | Grad Max: 0.010500 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001338 | Grad Max: 0.003715 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041015 | Grad Max: 0.041015 [GRADIENT NORM TOTAL] 20.2504 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.952 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5414626 0.45853743] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 693/1355 | B: 676/1372 | C: 676/1372 [LOSS Ex1] A: 0.63065 | B: 0.60466 | C: 0.59568 [LOGITS Ex2 A] Mean Abs: 2.128 | Max: 6.020 [LOSS Ex2] A: 0.09972 | B: 0.30935 | C: 0.20459 ** [JOINT LOSS] ** : 0.814883 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003722 | Grad Max: 0.095721 -> Layer: shared_layers.0.bias | Grad Mean: 0.289827 | Grad Max: 1.389866 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002087 | Grad Max: 0.005749 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009965 | Grad Max: 0.009965 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001944 | Grad Max: 0.620611 -> Layer: exit2_layers.0.bias | Grad Mean: 0.035508 | Grad Max: 3.496173 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000180 | Grad Max: 0.005763 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018590 | Grad Max: 0.094042 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000025 | Grad Max: 0.000316 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003809 | Grad Max: 0.008412 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000232 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000947 | Grad Max: 0.003722 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000434 | Grad Max: 0.002025 -> Layer: exit2_layers.12.bias | Grad Mean: 0.012131 | Grad Max: 0.012131 [GRADIENT NORM TOTAL] 7.3195 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.198 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8312995 0.16870058] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 747/1301 | B: 664/1384 | C: 672/1376 [LOSS Ex1] A: 0.62394 | B: 0.60491 | C: 0.59798 [LOGITS Ex2 A] Mean Abs: 2.212 | Max: 7.226 [LOSS Ex2] A: 0.10803 | B: 0.32319 | C: 0.20083 ** [JOINT LOSS] ** : 0.819625 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011984 | Grad Max: 0.420977 -> Layer: shared_layers.0.bias | Grad Mean: 1.042645 | Grad Max: 5.702648 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002124 | Grad Max: 0.006064 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000153 | Grad Max: 0.000153 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006493 | Grad Max: 1.324240 -> Layer: exit2_layers.0.bias | Grad Mean: 0.120091 | Grad Max: 7.351596 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000615 | Grad Max: 0.018477 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.065285 | Grad Max: 0.306428 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000826 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013103 | Grad Max: 0.025561 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000681 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003341 | Grad Max: 0.010421 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001348 | Grad Max: 0.003889 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043681 | Grad Max: 0.043681 [GRADIENT NORM TOTAL] 23.4887 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.332 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50084245 0.49915755] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 673/1375 | C: 667/1381 [LOSS Ex1] A: 0.63129 | B: 0.60040 | C: 0.60223 [LOGITS Ex2 A] Mean Abs: 2.254 | Max: 5.890 [LOSS Ex2] A: 0.10448 | B: 0.33430 | C: 0.25261 ** [JOINT LOSS] ** : 0.841770 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015344 | Grad Max: 0.631404 -> Layer: shared_layers.0.bias | Grad Mean: 1.580361 | Grad Max: 8.315900 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002092 | Grad Max: 0.005019 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004480 | Grad Max: 0.004480 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009919 | Grad Max: 1.707251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.184946 | Grad Max: 9.513133 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000962 | Grad Max: 0.028312 -> Layer: exit2_layers.3.bias | Grad Mean: 0.102550 | Grad Max: 0.511825 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000133 | Grad Max: 0.001259 -> Layer: exit2_layers.6.bias | Grad Mean: 0.020324 | Grad Max: 0.038861 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000055 | Grad Max: 0.001005 -> Layer: exit2_layers.9.bias | Grad Mean: 0.005215 | Grad Max: 0.016692 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002139 | 
Grad Max: 0.005191 -> Layer: exit2_layers.12.bias | Grad Mean: 0.069059 | Grad Max: 0.069059 [GRADIENT NORM TOTAL] 35.1114 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.937 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75429577 0.24570425] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 625/1231 | C: 433/943 [LOSS Ex1] A: 0.62619 | B: 0.60437 | C: 0.60538 [LOGITS Ex2 A] Mean Abs: 2.236 | Max: 6.447 [LOSS Ex2] A: 0.11590 | B: 0.32659 | C: 0.22910 ** [JOINT LOSS] ** : 0.835842 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014054 | Grad Max: 0.489677 -> Layer: shared_layers.0.bias | Grad Mean: 1.264088 | Grad Max: 6.436862 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005185 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001868 | Grad Max: 0.001868 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007995 | Grad Max: 1.360477 -> Layer: exit2_layers.0.bias | Grad Mean: 0.148098 | Grad Max: 7.557583 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000783 | Grad Max: 0.023798 -> Layer: exit2_layers.3.bias | Grad Mean: 0.083062 | Grad Max: 0.413053 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000110 | Grad Max: 0.001073 -> Layer: exit2_layers.6.bias | Grad Mean: 0.016768 | Grad Max: 0.032966 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000046 | Grad Max: 0.000786 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004351 | Grad Max: 0.013083 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001810 | Grad Max: 0.004578 -> Layer: exit2_layers.12.bias | Grad Mean: 0.058630 | Grad Max: 0.058630 [GRADIENT NORM TOTAL] 27.6909 [EPOCH SUMMARY] Train Loss: 0.8259 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.7980 | Alpha: 0.5500 No improve count: 5/15 ############################## EPOCH 179/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.092 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64778376 0.35221627] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 676/1372 | C: 676/1372 [LOSS Ex1] A: 0.62429 | B: 0.60461 | C: 0.60094 [LOGITS Ex2 A] Mean Abs: 2.238 | Max: 11.010 [LOSS Ex2] A: 0.10081 | B: 0.30919 | C: 0.20766 ** [JOINT LOSS] ** : 0.815836 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006441 | Grad Max: 0.223745 -> Layer: shared_layers.0.bias | Grad Mean: 0.581253 | Grad Max: 2.850693 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005370 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006442 | Grad Max: 0.006442 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003537 | Grad Max: 0.548643 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065670 | Grad Max: 3.037905 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000350 | Grad Max: 0.011980 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036843 | Grad Max: 0.195814 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 0.000539 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007364 | Grad Max: 0.015009 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000375 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001865 | Grad Max: 0.005987 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000765 | Grad Max: 0.002722 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024483 | Grad Max: 0.024483 [GRADIENT NORM TOTAL] 12.5158 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.333 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083524 0.4916476] | 
Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 728/1320 | B: 664/1384 | C: 682/1366 [LOSS Ex1] A: 0.62506 | B: 0.60486 | C: 0.60183 [LOGITS Ex2 A] Mean Abs: 2.142 | Max: 8.550 [LOSS Ex2] A: 0.09107 | B: 0.31905 | C: 0.22110 ** [JOINT LOSS] ** : 0.820994 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006629 | Grad Max: 0.217803 -> Layer: shared_layers.0.bias | Grad Mean: 0.624401 | Grad Max: 2.761595 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005495 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000852 | Grad Max: 0.000852 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004220 | Grad Max: 0.580231 -> Layer: exit2_layers.0.bias | Grad Mean: 0.078670 | Grad Max: 3.208545 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000429 | Grad Max: 0.013728 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046005 | Grad Max: 0.220534 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000060 | Grad Max: 0.000677 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009165 | Grad Max: 0.019732 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000531 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002350 | Grad Max: 0.008490 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001012 | Grad Max: 0.003261 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031983 | Grad Max: 0.031983 [GRADIENT NORM TOTAL] 13.9306 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.248 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5042411 0.4957589] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 720/1328 | B: 673/1375 | C: 666/1382 [LOSS Ex1] A: 0.62173 | B: 0.60036 | C: 0.60293 [LOGITS Ex2 A] Mean Abs: 2.127 | Max: 7.436 [LOSS Ex2] A: 0.10279 | B: 0.32214 | C: 0.20089 ** [JOINT LOSS] ** : 0.816945 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009765 | Grad Max: 
0.338026 -> Layer: shared_layers.0.bias | Grad Mean: 0.957211 | Grad Max: 4.494227 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002200 | Grad Max: 0.005773 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007534 | Grad Max: 0.007534 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006349 | Grad Max: 1.071605 -> Layer: exit2_layers.0.bias | Grad Mean: 0.118654 | Grad Max: 5.917881 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000621 | Grad Max: 0.020885 -> Layer: exit2_layers.3.bias | Grad Mean: 0.066322 | Grad Max: 0.351665 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000855 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013081 | Grad Max: 0.025685 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000696 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003308 | Grad Max: 0.011082 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001366 | Grad Max: 0.003612 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043120 | Grad Max: 0.043120 [GRADIENT NORM TOTAL] 21.7681 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.286 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51185 0.48815003] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 725/1323 | B: 625/1231 | C: 728/1320 [LOSS Ex1] A: 0.62067 | B: 0.60432 | C: 0.59106 [LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.099 [LOSS Ex2] A: 0.11178 | B: 0.30989 | C: 0.20256 ** [JOINT LOSS] ** : 0.813425 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007969 | Grad Max: 0.333315 -> Layer: shared_layers.0.bias | Grad Mean: 0.867433 | Grad Max: 4.286891 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002199 | Grad Max: 0.005940 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001924 | Grad Max: 0.001924 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005557 | Grad Max: 0.918778 -> Layer: exit2_layers.0.bias | Grad Mean: 0.103954 | Grad Max: 5.065749 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000549 | Grad Max: 0.016703 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058875 | Grad Max: 0.281448 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000765 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011546 | Grad Max: 0.023796 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000631 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002938 | Grad Max: 0.010108 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001221 | Grad Max: 0.003486 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038688 | Grad Max: 0.038688 [GRADIENT NORM TOTAL] 19.3267 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.064 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003217 0.4996783] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 690/1358 | B: 676/1372 | C: 700/1348 [LOSS Ex1] A: 0.62973 | B: 0.60457 | C: 0.59634 [LOGITS Ex2 A] Mean Abs: 2.137 | Max: 5.424 [LOSS Ex2] A: 0.09548 | B: 0.30624 | C: 0.21640 ** [JOINT LOSS] ** : 0.816251 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002847 | Grad Max: 0.105010 -> Layer: shared_layers.0.bias | Grad Mean: 0.290213 | Grad Max: 1.462826 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002039 | Grad Max: 0.005705 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009280 | Grad Max: 0.009280 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001788 | Grad Max: 0.233357 -> Layer: exit2_layers.0.bias | Grad Mean: 0.032828 | Grad Max: 1.302356 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000168 | Grad Max: 0.005994 -> Layer: exit2_layers.3.bias | Grad Mean: 0.018046 | Grad Max: 0.104424 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000023 | Grad Max: 0.000316 -> Layer: exit2_layers.6.bias | Grad Mean: 0.003478 | Grad Max: 0.008410 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000010 | Grad Max: 0.000232 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.000843 | Grad Max: 0.003549 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000420 | Grad Max: 0.002078 -> Layer: exit2_layers.12.bias | Grad Mean: 0.011647 | Grad Max: 0.011647 [GRADIENT NORM TOTAL] 6.1569 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.954 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5413787 0.45862132] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 693/1355 | B: 664/1384 | C: 643/1405 [LOSS Ex1] A: 0.63057 | B: 0.60481 | C: 0.60104 [LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.035 [LOSS Ex2] A: 0.10271 | B: 0.32502 | C: 0.21349 ** [JOINT LOSS] ** : 0.825877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008517 | Grad Max: 0.368495 -> Layer: shared_layers.0.bias | Grad Mean: 1.002329 | Grad Max: 4.946062 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.005361 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009546 | Grad Max: 0.009546 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006288 | Grad Max: 1.070747 -> Layer: exit2_layers.0.bias | Grad Mean: 0.117348 | Grad Max: 5.916045 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000619 | Grad Max: 0.019206 -> Layer: exit2_layers.3.bias | Grad Mean: 0.066876 | Grad Max: 0.327694 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000863 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013285 | Grad Max: 0.026612 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000695 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003434 | Grad Max: 0.010514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001415 | Grad Max: 0.003864 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045398 | Grad Max: 0.045398 [GRADIENT NORM TOTAL] 22.3637 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.157 | Max: 1.200 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8316228 0.16837719] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 747/1301 | B: 673/1375 | C: 647/1401 [LOSS Ex1] A: 0.62386 | B: 0.60030 | C: 0.59851 [LOGITS Ex2 A] Mean Abs: 2.237 | Max: 7.101 [LOSS Ex2] A: 0.11859 | B: 0.34749 | C: 0.23693 ** [JOINT LOSS] ** : 0.841895 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.016028 | Grad Max: 0.598278 -> Layer: shared_layers.0.bias | Grad Mean: 1.612443 | Grad Max: 8.055904 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002190 | Grad Max: 0.005621 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000474 | Grad Max: 0.000474 -> Layer: exit2_layers.0.weight | Grad Mean: 0.010207 | Grad Max: 1.544098 -> Layer: exit2_layers.0.bias | Grad Mean: 0.189630 | Grad Max: 8.640054 -> Layer: exit2_layers.3.weight | Grad Mean: 0.001010 | Grad Max: 0.029349 -> Layer: exit2_layers.3.bias | Grad Mean: 0.108224 | Grad Max: 0.525109 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000141 | Grad Max: 0.001435 -> Layer: exit2_layers.6.bias | Grad Mean: 0.021612 | Grad Max: 0.042000 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000059 | Grad Max: 0.001139 -> Layer: exit2_layers.9.bias | Grad Mean: 0.005566 | Grad Max: 0.017823 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002324 | Grad Max: 0.005489 -> Layer: exit2_layers.12.bias | Grad Mean: 0.073517 | Grad Max: 0.073517 [GRADIENT NORM TOTAL] 35.3616 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.334 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5008749 0.49912515] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 625/1231 | C: 672/1376 [LOSS Ex1] A: 0.63120 | B: 0.60427 | C: 0.60061 [LOGITS Ex2 A] Mean Abs: 2.238 | Max: 6.311 [LOSS Ex2] A: 0.11064 | B: 0.34524 | C: 0.25644 ** [JOINT LOSS] ** : 0.849467 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014291 | Grad Max: 0.495274 -> Layer: shared_layers.0.bias | Grad Mean: 1.425849 | Grad Max: 6.624592 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002053 | Grad Max: 0.005452 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003590 | Grad Max: 0.003590 -> Layer: exit2_layers.0.weight | Grad Mean: 0.009085 | Grad Max: 1.330263 -> Layer: exit2_layers.0.bias | Grad Mean: 0.169754 | Grad Max: 7.335798 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000909 | Grad Max: 0.028273 -> Layer: exit2_layers.3.bias | Grad Mean: 0.097472 | Grad Max: 0.484537 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000127 | Grad Max: 0.001222 -> Layer: exit2_layers.6.bias | Grad Mean: 0.019403 | Grad Max: 0.037876 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000054 | Grad Max: 0.001045 -> Layer: exit2_layers.9.bias | Grad Mean: 0.005017 | Grad Max: 0.016646 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002079 | Grad Max: 0.004780 -> Layer: exit2_layers.12.bias | Grad Mean: 0.066100 | Grad Max: 0.066100 [GRADIENT NORM TOTAL] 30.7631 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.939 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75455636 0.24544364] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 704/1344 [LOSS Ex1] A: 0.62610 | B: 0.60451 | C: 0.59889 [LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.398 [LOSS Ex2] A: 0.11245 | B: 0.31611 | C: 0.21232 ** [JOINT LOSS] ** : 0.823463 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006240 | Grad Max: 0.236515 -> Layer: shared_layers.0.bias | Grad Mean: 0.711698 | Grad Max: 3.210121 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.006094 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000309 | Grad Max: 0.000309 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004517 | Grad Max: 
0.665101 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084343 | Grad Max: 3.653263 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000474 | Grad Max: 0.014833 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051033 | Grad Max: 0.255775 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000657 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010116 | Grad Max: 0.020944 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000561 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002626 | Grad Max: 0.008284 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.003246 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034629 | Grad Max: 0.034629 [GRADIENT NORM TOTAL] 15.1068 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.093 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.647944 0.352056] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 664/1384 | C: 663/1385 [LOSS Ex1] A: 0.62420 | B: 0.60476 | C: 0.60062 [LOGITS Ex2 A] Mean Abs: 2.154 | Max: 9.626 [LOSS Ex2] A: 0.09728 | B: 0.30914 | C: 0.21092 ** [JOINT LOSS] ** : 0.815636 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009556 | Grad Max: 0.272860 -> Layer: shared_layers.0.bias | Grad Mean: 0.696616 | Grad Max: 3.675936 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002096 | Grad Max: 0.005648 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009743 | Grad Max: 0.009743 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004138 | Grad Max: 0.933104 -> Layer: exit2_layers.0.bias | Grad Mean: 0.076476 | Grad Max: 5.163054 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000401 | Grad Max: 0.012262 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041810 | Grad Max: 0.220111 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000613 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008555 | Grad Max: 0.017202 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000439 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002226 | Grad Max: 0.007068 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001016 | Grad Max: 0.003303 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030432 | Grad Max: 0.030432 [GRADIENT NORM TOTAL] 15.4797 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.335 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5083989 0.49160108] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 673/1375 | C: 652/1396 [LOSS Ex1] A: 0.62498 | B: 0.60025 | C: 0.60158 [LOGITS Ex2 A] Mean Abs: 2.118 | Max: 7.771 [LOSS Ex2] A: 0.09648 | B: 0.32054 | C: 0.21210 ** [JOINT LOSS] ** : 0.818645 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013624 | Grad Max: 0.368030 -> Layer: shared_layers.0.bias | Grad Mean: 1.120992 | Grad Max: 5.025382 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005339 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002526 | Grad Max: 0.002526 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006983 | Grad Max: 1.146319 -> Layer: exit2_layers.0.bias | Grad Mean: 0.129799 | Grad Max: 6.337554 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000725 | Grad Max: 0.020706 -> Layer: exit2_layers.3.bias | Grad Mean: 0.076519 | Grad Max: 0.381745 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000103 | Grad Max: 0.001077 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015424 | Grad Max: 0.030703 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000044 | Grad Max: 0.000800 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003963 | Grad Max: 0.013128 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001693 | Grad Max: 0.004503 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052305 | Grad Max: 0.052305 [GRADIENT NORM TOTAL] 23.7567 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.250 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5041692 0.4958307] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 720/1328 | B: 625/1231 | C: 639/1409 [LOSS Ex1] A: 0.62164 | B: 0.60421 | C: 0.60219 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 7.304 [LOSS Ex2] A: 0.10431 | B: 0.33565 | C: 0.21990 ** [JOINT LOSS] ** : 0.829302 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011736 | Grad Max: 0.358405 -> Layer: shared_layers.0.bias | Grad Mean: 1.038439 | Grad Max: 4.684211 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005492 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001884 | Grad Max: 0.001884 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006428 | Grad Max: 0.989724 -> Layer: exit2_layers.0.bias | Grad Mean: 0.119218 | Grad Max: 5.487039 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000658 | Grad Max: 0.018414 -> Layer: exit2_layers.3.bias | Grad Mean: 0.069797 | Grad Max: 0.329592 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000093 | Grad Max: 0.000926 -> Layer: exit2_layers.6.bias | Grad Mean: 0.014103 | Grad Max: 0.027276 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000752 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003658 | Grad Max: 0.011856 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001578 | Grad Max: 0.003902 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048762 | Grad Max: 0.048762 [GRADIENT NORM TOTAL] 21.9857 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.288 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5119436 0.48805642] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 685/1363 [LOSS Ex1] A: 0.62058 | B: 0.60446 | C: 0.59540 [LOGITS Ex2 A] Mean Abs: 2.127 | Max: 6.011 
[LOSS Ex2] A: 0.10485 | B: 0.31134 | C: 0.20016 ** [JOINT LOSS] ** : 0.812267 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004016 | Grad Max: 0.177553 -> Layer: shared_layers.0.bias | Grad Mean: 0.419811 | Grad Max: 2.440453 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002227 | Grad Max: 0.006514 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003037 | Grad Max: 0.003037 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002462 | Grad Max: 0.455359 -> Layer: exit2_layers.0.bias | Grad Mean: 0.045314 | Grad Max: 2.506955 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000222 | Grad Max: 0.007705 -> Layer: exit2_layers.3.bias | Grad Mean: 0.023397 | Grad Max: 0.138219 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000030 | Grad Max: 0.000329 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004612 | Grad Max: 0.010213 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000013 | Grad Max: 0.000290 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001225 | Grad Max: 0.003785 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000547 | Grad Max: 0.002299 -> Layer: exit2_layers.12.bias | Grad Mean: 0.017153 | Grad Max: 0.017153 [GRADIENT NORM TOTAL] 9.3301 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.066 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003635 0.49963647] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 688/1360 | B: 664/1384 | C: 425/951 [LOSS Ex1] A: 0.62963 | B: 0.60470 | C: 0.60330 [LOGITS Ex2 A] Mean Abs: 2.163 | Max: 5.662 [LOSS Ex2] A: 0.10351 | B: 0.31160 | C: 0.22496 ** [JOINT LOSS] ** : 0.825904 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008084 | Grad Max: 0.227132 -> Layer: shared_layers.0.bias | Grad Mean: 0.695873 | Grad Max: 3.225359 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002000 | Grad Max: 0.005259 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006890 | Grad Max: 
0.006890 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004484 | Grad Max: 0.759811 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083223 | Grad Max: 4.253567 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000430 | Grad Max: 0.014776 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045808 | Grad Max: 0.251773 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000655 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009168 | Grad Max: 0.018644 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000453 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002359 | Grad Max: 0.007281 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000942 | Grad Max: 0.002946 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030365 | Grad Max: 0.030365 [GRADIENT NORM TOTAL] 15.6298 [EPOCH SUMMARY] Train Loss: 0.8233 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8140 | Alpha: 0.5500 No improve count: 6/15 ############################## EPOCH 180/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.955 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5412638 0.45873618] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 693/1355 | B: 674/1374 | C: 661/1387 [LOSS Ex1] A: 0.63048 | B: 0.60019 | C: 0.60583 [LOGITS Ex2 A] Mean Abs: 2.184 | Max: 6.055 [LOSS Ex2] A: 0.11990 | B: 0.32010 | C: 0.22744 ** [JOINT LOSS] ** : 0.834645 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012204 | Grad Max: 0.417684 -> Layer: shared_layers.0.bias | Grad Mean: 1.140199 | Grad Max: 5.679196 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002073 | Grad Max: 0.005966 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012628 | Grad Max: 0.012628 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007102 | Grad Max: 1.288027 -> Layer: exit2_layers.0.bias | Grad 
Mean: 0.132314 | Grad Max: 7.170654 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000687 | Grad Max: 0.020348 -> Layer: exit2_layers.3.bias | Grad Mean: 0.073394 | Grad Max: 0.351500 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000097 | Grad Max: 0.000999 -> Layer: exit2_layers.6.bias | Grad Mean: 0.014642 | Grad Max: 0.029316 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000041 | Grad Max: 0.000759 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003771 | Grad Max: 0.011659 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001551 | Grad Max: 0.003993 -> Layer: exit2_layers.12.bias | Grad Mean: 0.049233 | Grad Max: 0.049233 [GRADIENT NORM TOTAL] 25.1413 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.202 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8319328 0.16806722] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 747/1301 | B: 626/1230 | C: 665/1383 [LOSS Ex1] A: 0.62378 | B: 0.60415 | C: 0.59981 [LOGITS Ex2 A] Mean Abs: 2.191 | Max: 6.391 [LOSS Ex2] A: 0.11037 | B: 0.30026 | C: 0.21245 ** [JOINT LOSS] ** : 0.816939 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011974 | Grad Max: 0.313637 -> Layer: shared_layers.0.bias | Grad Mean: 0.864074 | Grad Max: 4.338882 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005765 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003586 | Grad Max: 0.003586 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005484 | Grad Max: 0.945302 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101382 | Grad Max: 5.257596 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000547 | Grad Max: 0.016274 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057359 | Grad Max: 0.278352 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000753 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011722 | Grad Max: 0.022523 -> Layer: exit2_layers.9.weight | Grad Mean: 
0.000034 | Grad Max: 0.000600 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003084 | Grad Max: 0.009637 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001305 | Grad Max: 0.003684 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040956 | Grad Max: 0.040956 [GRADIENT NORM TOTAL] 18.8770 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.336 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5009262 0.49907383] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 712/1336 [LOSS Ex1] A: 0.63112 | B: 0.60440 | C: 0.58453 [LOGITS Ex2 A] Mean Abs: 2.161 | Max: 5.975 [LOSS Ex2] A: 0.09550 | B: 0.30381 | C: 0.20544 ** [JOINT LOSS] ** : 0.808270 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.001980 | Grad Max: 0.062157 -> Layer: shared_layers.0.bias | Grad Mean: 0.114037 | Grad Max: 0.883530 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002128 | Grad Max: 0.005563 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007189 | Grad Max: 0.007189 -> Layer: exit2_layers.0.weight | Grad Mean: 0.000739 | Grad Max: 0.177871 -> Layer: exit2_layers.0.bias | Grad Mean: 0.012901 | Grad Max: 0.953984 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000061 | Grad Max: 0.002454 -> Layer: exit2_layers.3.bias | Grad Mean: 0.005609 | Grad Max: 0.035070 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000008 | Grad Max: 0.000143 -> Layer: exit2_layers.6.bias | Grad Mean: 0.001001 | Grad Max: 0.004031 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000005 | Grad Max: 0.000135 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000241 | Grad Max: 0.001256 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000443 | Grad Max: 0.001144 -> Layer: exit2_layers.12.bias | Grad Mean: 0.002470 | Grad Max: 0.002470 [GRADIENT NORM TOTAL] 2.6280 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 
0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.940 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75479656 0.2452034 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 664/1384 | C: 654/1394 [LOSS Ex1] A: 0.62601 | B: 0.60464 | C: 0.60668 [LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.197 [LOSS Ex2] A: 0.11674 | B: 0.32863 | C: 0.21200 ** [JOINT LOSS] ** : 0.831562 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010734 | Grad Max: 0.321227 -> Layer: shared_layers.0.bias | Grad Mean: 0.951551 | Grad Max: 4.186574 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.005428 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003641 | Grad Max: 0.003641 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005879 | Grad Max: 0.729337 -> Layer: exit2_layers.0.bias | Grad Mean: 0.109604 | Grad Max: 4.022493 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000598 | Grad Max: 0.017524 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063893 | Grad Max: 0.326924 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000855 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012841 | Grad Max: 0.025220 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000713 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003331 | Grad Max: 0.011406 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001403 | Grad Max: 0.003672 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043774 | Grad Max: 0.043774 [GRADIENT NORM TOTAL] 19.7634 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.095 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64798343 0.35201657] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 591/1025 | B: 674/1374 | C: 687/1361 [LOSS Ex1] A: 0.62408 | B: 0.60013 | C: 0.60138 [LOGITS Ex2 A] Mean Abs: 2.109 | Max: 8.538 [LOSS Ex2] A: 0.11295 | B: 0.33531 | 
C: 0.22216 ** [JOINT LOSS] ** : 0.832004 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.013421 | Grad Max: 0.375179 -> Layer: shared_layers.0.bias | Grad Mean: 1.102917 | Grad Max: 5.012146 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002123 | Grad Max: 0.005150 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000821 | Grad Max: 0.000821 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007031 | Grad Max: 0.938939 -> Layer: exit2_layers.0.bias | Grad Mean: 0.130475 | Grad Max: 5.176515 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000710 | Grad Max: 0.021349 -> Layer: exit2_layers.3.bias | Grad Mean: 0.075510 | Grad Max: 0.379250 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000101 | Grad Max: 0.001005 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015270 | Grad Max: 0.029988 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000862 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003986 | Grad Max: 0.012963 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001693 | Grad Max: 0.004164 -> Layer: exit2_layers.12.bias | Grad Mean: 0.052847 | Grad Max: 0.052847 [GRADIENT NORM TOTAL] 23.4721 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.338 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084681 0.49153194] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 661/1387 [LOSS Ex1] A: 0.62486 | B: 0.60408 | C: 0.60208 [LOGITS Ex2 A] Mean Abs: 2.103 | Max: 7.781 [LOSS Ex2] A: 0.10128 | B: 0.32336 | C: 0.20980 ** [JOINT LOSS] ** : 0.821822 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009699 | Grad Max: 0.290844 -> Layer: shared_layers.0.bias | Grad Mean: 0.863885 | Grad Max: 3.835822 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002067 | Grad Max: 0.005571 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000499 | Grad Max: 0.000499 -> Layer: 
exit2_layers.0.weight | Grad Mean: 0.005464 | Grad Max: 0.707984 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102292 | Grad Max: 3.909624 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000581 | Grad Max: 0.019454 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061994 | Grad Max: 0.331032 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000811 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012329 | Grad Max: 0.024060 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000713 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003182 | Grad Max: 0.011476 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001347 | Grad Max: 0.003843 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041738 | Grad Max: 0.041738 [GRADIENT NORM TOTAL] 18.2734 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.252 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039877 0.49601227] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 719/1329 | B: 676/1372 | C: 659/1389 [LOSS Ex1] A: 0.62153 | B: 0.60433 | C: 0.60141 [LOGITS Ex2 A] Mean Abs: 2.153 | Max: 7.709 [LOSS Ex2] A: 0.09776 | B: 0.30647 | C: 0.20947 ** [JOINT LOSS] ** : 0.813658 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002779 | Grad Max: 0.080755 -> Layer: shared_layers.0.bias | Grad Mean: 0.137526 | Grad Max: 0.804731 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002197 | Grad Max: 0.006276 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005578 | Grad Max: 0.005578 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001008 | Grad Max: 0.205395 -> Layer: exit2_layers.0.bias | Grad Mean: 0.017566 | Grad Max: 1.158248 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000047 | Grad Max: 0.002540 -> Layer: exit2_layers.3.bias | Grad Mean: 0.003553 | Grad Max: 0.041280 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000006 | Grad Max: 0.000113 -> Layer: 
exit2_layers.6.bias | Grad Mean: 0.000538 | Grad Max: 0.002948 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000004 | Grad Max: 0.000125 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000133 | Grad Max: 0.001467 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000416 | Grad Max: 0.001415 -> Layer: exit2_layers.12.bias | Grad Mean: 0.001049 | Grad Max: 0.001049 [GRADIENT NORM TOTAL] 3.5706 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.291 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51209366 0.48790634] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 685/1363 [LOSS Ex1] A: 0.62046 | B: 0.60456 | C: 0.59781 [LOGITS Ex2 A] Mean Abs: 2.200 | Max: 6.803 [LOSS Ex2] A: 0.12469 | B: 0.32587 | C: 0.22481 ** [JOINT LOSS] ** : 0.832734 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011909 | Grad Max: 0.346940 -> Layer: shared_layers.0.bias | Grad Mean: 1.006631 | Grad Max: 4.741487 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005736 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001583 | Grad Max: 0.001583 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006523 | Grad Max: 0.953069 -> Layer: exit2_layers.0.bias | Grad Mean: 0.119816 | Grad Max: 5.293045 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.018263 -> Layer: exit2_layers.3.bias | Grad Mean: 0.066879 | Grad Max: 0.330969 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000988 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013616 | Grad Max: 0.027816 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000740 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003521 | Grad Max: 0.011463 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001468 | Grad Max: 0.003901 -> Layer: exit2_layers.12.bias | Grad Mean: 0.045794 | Grad Max: 0.045794 [GRADIENT NORM TOTAL] 
22.2375 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.068 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004195 0.49958047] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 682/1366 [LOSS Ex1] A: 0.62949 | B: 0.60004 | C: 0.59861 [LOGITS Ex2 A] Mean Abs: 2.199 | Max: 6.147 [LOSS Ex2] A: 0.10060 | B: 0.33004 | C: 0.22327 ** [JOINT LOSS] ** : 0.827350 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011831 | Grad Max: 0.495697 -> Layer: shared_layers.0.bias | Grad Mean: 1.300507 | Grad Max: 6.744269 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002107 | Grad Max: 0.005129 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000592 | Grad Max: 0.000592 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008131 | Grad Max: 1.193854 -> Layer: exit2_layers.0.bias | Grad Mean: 0.151456 | Grad Max: 6.680124 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000774 | Grad Max: 0.023513 -> Layer: exit2_layers.3.bias | Grad Mean: 0.083604 | Grad Max: 0.430340 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000109 | Grad Max: 0.001083 -> Layer: exit2_layers.6.bias | Grad Mean: 0.016764 | Grad Max: 0.031962 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000047 | Grad Max: 0.000924 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004399 | Grad Max: 0.013944 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001806 | Grad Max: 0.004696 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057706 | Grad Max: 0.057706 [GRADIENT NORM TOTAL] 28.6012 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.957 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5411277 0.45887235] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 653/1395 [LOSS Ex1] A: 
0.63035 | B: 0.60398 | C: 0.60618 [LOGITS Ex2 A] Mean Abs: 2.167 | Max: 6.610 [LOSS Ex2] A: 0.10819 | B: 0.31185 | C: 0.21101 ** [JOINT LOSS] ** : 0.823855 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007445 | Grad Max: 0.369206 -> Layer: shared_layers.0.bias | Grad Mean: 0.929726 | Grad Max: 4.902701 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001978 | Grad Max: 0.005190 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008448 | Grad Max: 0.008448 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005785 | Grad Max: 0.861867 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107875 | Grad Max: 4.777722 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000564 | Grad Max: 0.018446 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061285 | Grad Max: 0.331534 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000782 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012272 | Grad Max: 0.024042 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000620 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003265 | Grad Max: 0.009971 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001351 | Grad Max: 0.003892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043441 | Grad Max: 0.043441 [GRADIENT NORM TOTAL] 20.6043 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.205 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83235884 0.16764121] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 661/1387 [LOSS Ex1] A: 0.62365 | B: 0.60423 | C: 0.59803 [LOGITS Ex2 A] Mean Abs: 2.144 | Max: 7.407 [LOSS Ex2] A: 0.09185 | B: 0.30656 | C: 0.20691 ** [JOINT LOSS] ** : 0.810412 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.002076 | Grad Max: 0.092863 -> Layer: shared_layers.0.bias | Grad Mean: 0.219473 | Grad Max: 1.097072 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002198 | 
Grad Max: 0.005902 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006709 | Grad Max: 0.006709 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001438 | Grad Max: 0.281888 -> Layer: exit2_layers.0.bias | Grad Mean: 0.026424 | Grad Max: 1.562408 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000108 | Grad Max: 0.004788 -> Layer: exit2_layers.3.bias | Grad Mean: 0.011264 | Grad Max: 0.073815 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000015 | Grad Max: 0.000200 -> Layer: exit2_layers.6.bias | Grad Mean: 0.002220 | Grad Max: 0.005439 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000007 | Grad Max: 0.000164 -> Layer: exit2_layers.9.bias | Grad Mean: 0.000590 | Grad Max: 0.002139 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000479 | Grad Max: 0.001594 -> Layer: exit2_layers.12.bias | Grad Mean: 0.007801 | Grad Max: 0.007801 [GRADIENT NORM TOTAL] 5.5189 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.340 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50100964 0.49899033] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 709/1339 [LOSS Ex1] A: 0.63098 | B: 0.60446 | C: 0.59885 [LOGITS Ex2 A] Mean Abs: 2.086 | Max: 5.655 [LOSS Ex2] A: 0.10284 | B: 0.33029 | C: 0.21974 ** [JOINT LOSS] ** : 0.829058 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012635 | Grad Max: 0.329334 -> Layer: shared_layers.0.bias | Grad Mean: 0.946360 | Grad Max: 4.471603 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.005470 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000405 | Grad Max: 0.000405 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005914 | Grad Max: 1.109685 -> Layer: exit2_layers.0.bias | Grad Mean: 0.109519 | Grad Max: 6.144238 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.018218 -> Layer: exit2_layers.3.bias | Grad Mean: 0.062609 | Grad Max: 
0.316035 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000086 | Grad Max: 0.000942 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012640 | Grad Max: 0.024925 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000708 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003307 | Grad Max: 0.011044 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001467 | Grad Max: 0.003986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044371 | Grad Max: 0.044371 [GRADIENT NORM TOTAL] 20.6673 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.942 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7551747 0.24482533] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 714/1334 [LOSS Ex1] A: 0.62587 | B: 0.59994 | C: 0.59296 [LOGITS Ex2 A] Mean Abs: 2.048 | Max: 6.952 [LOSS Ex2] A: 0.13758 | B: 0.33670 | C: 0.24319 ** [JOINT LOSS] ** : 0.845414 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.019723 | Grad Max: 0.483343 -> Layer: shared_layers.0.bias | Grad Mean: 1.338564 | Grad Max: 5.760572 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002177 | Grad Max: 0.005506 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007868 | Grad Max: 0.007868 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008505 | Grad Max: 1.278497 -> Layer: exit2_layers.0.bias | Grad Mean: 0.157512 | Grad Max: 7.074866 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000884 | Grad Max: 0.024046 -> Layer: exit2_layers.3.bias | Grad Mean: 0.092796 | Grad Max: 0.441927 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000129 | Grad Max: 0.001360 -> Layer: exit2_layers.6.bias | Grad Mean: 0.018974 | Grad Max: 0.036242 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000057 | Grad Max: 0.001067 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004980 | Grad Max: 0.016062 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002225 | Grad Max: 0.005274 -> 
Layer: exit2_layers.12.bias | Grad Mean: 0.067175 | Grad Max: 0.067175 [GRADIENT NORM TOTAL] 28.1168 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.098 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6481313 0.3518687] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 590/1026 | B: 626/1230 | C: 459/917 [LOSS Ex1] A: 0.62395 | B: 0.60388 | C: 0.59590 [LOGITS Ex2 A] Mean Abs: 2.094 | Max: 8.766 [LOSS Ex2] A: 0.12198 | B: 0.32832 | C: 0.21034 ** [JOINT LOSS] ** : 0.828125 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.016165 | Grad Max: 0.405707 -> Layer: shared_layers.0.bias | Grad Mean: 1.145751 | Grad Max: 5.206437 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005741 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008732 | Grad Max: 0.008732 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007477 | Grad Max: 1.100398 -> Layer: exit2_layers.0.bias | Grad Mean: 0.138557 | Grad Max: 6.091342 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000771 | Grad Max: 0.023124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.081271 | Grad Max: 0.402823 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000112 | Grad Max: 0.001275 -> Layer: exit2_layers.6.bias | Grad Mean: 0.016571 | Grad Max: 0.032262 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000049 | Grad Max: 0.000943 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004323 | Grad Max: 0.014284 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001903 | Grad Max: 0.004730 -> Layer: exit2_layers.12.bias | Grad Mean: 0.057321 | Grad Max: 0.057321 [GRADIENT NORM TOTAL] 24.7965 [EPOCH SUMMARY] Train Loss: 0.8254 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.7993 | Alpha: 0.5500 No improve count: 7/15 ############################## EPOCH 181/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.341 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50849855 0.49150142] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 693/1355 [LOSS Ex1] A: 0.62473 | B: 0.60414 | C: 0.59546 [LOGITS Ex2 A] Mean Abs: 2.157 | Max: 8.328 [LOSS Ex2] A: 0.09471 | B: 0.30944 | C: 0.18726 ** [JOINT LOSS] ** : 0.805250 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005664 | Grad Max: 0.186252 -> Layer: shared_layers.0.bias | Grad Mean: 0.457221 | Grad Max: 2.534513 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002129 | Grad Max: 0.005753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000278 | Grad Max: 0.000278 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002925 | Grad Max: 0.746389 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053859 | Grad Max: 4.133531 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000265 | Grad Max: 0.007987 -> Layer: exit2_layers.3.bias | Grad Mean: 0.028209 | Grad Max: 0.134859 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000038 | Grad Max: 0.000482 -> Layer: exit2_layers.6.bias | Grad Mean: 0.005749 | Grad Max: 0.012015 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000017 | Grad Max: 0.000352 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001476 | Grad Max: 0.005421 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000652 | Grad Max: 0.002723 -> Layer: exit2_layers.12.bias | Grad Mean: 0.018941 | Grad Max: 0.018941 [GRADIENT NORM TOTAL] 11.4711 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.255 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5039017 0.49609825] | 
Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 656/1392 [LOSS Ex1] A: 0.62140 | B: 0.60437 | C: 0.60388 [LOGITS Ex2 A] Mean Abs: 2.219 | Max: 6.596 [LOSS Ex2] A: 0.10139 | B: 0.31783 | C: 0.22297 ** [JOINT LOSS] ** : 0.823947 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008539 | Grad Max: 0.279840 -> Layer: shared_layers.0.bias | Grad Mean: 0.793455 | Grad Max: 3.857102 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002104 | Grad Max: 0.005457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000193 | Grad Max: 0.000193 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005153 | Grad Max: 0.856730 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095923 | Grad Max: 4.726748 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000514 | Grad Max: 0.015879 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055067 | Grad Max: 0.294880 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000761 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011015 | Grad Max: 0.021720 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000628 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002875 | Grad Max: 0.009213 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001173 | Grad Max: 0.003495 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037139 | Grad Max: 0.037139 [GRADIENT NORM TOTAL] 17.6390 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.294 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51216656 0.48783347] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 744/1304 [LOSS Ex1] A: 0.62034 | B: 0.59986 | C: 0.59689 [LOGITS Ex2 A] Mean Abs: 2.225 | Max: 6.730 [LOSS Ex2] A: 0.12271 | B: 0.33230 | C: 0.24036 ** [JOINT LOSS] ** : 0.837485 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014152 | Grad 
Max: 0.457765 -> Layer: shared_layers.0.bias | Grad Mean: 1.273891 | Grad Max: 6.270208 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002144 | Grad Max: 0.006460 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003907 | Grad Max: 0.003907 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008284 | Grad Max: 1.216679 -> Layer: exit2_layers.0.bias | Grad Mean: 0.153394 | Grad Max: 6.722294 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000831 | Grad Max: 0.024538 -> Layer: exit2_layers.3.bias | Grad Mean: 0.088969 | Grad Max: 0.441241 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001370 -> Layer: exit2_layers.6.bias | Grad Mean: 0.018018 | Grad Max: 0.035067 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001023 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004717 | Grad Max: 0.015310 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002023 | Grad Max: 0.004641 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062207 | Grad Max: 0.062207 [GRADIENT NORM TOTAL] 27.7806 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.071 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5003985 0.49960142] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 680/1368 [LOSS Ex1] A: 0.62938 | B: 0.60379 | C: 0.60289 [LOGITS Ex2 A] Mean Abs: 2.194 | Max: 5.970 [LOSS Ex2] A: 0.10496 | B: 0.31596 | C: 0.22621 ** [JOINT LOSS] ** : 0.827729 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012660 | Grad Max: 0.384954 -> Layer: shared_layers.0.bias | Grad Mean: 1.058751 | Grad Max: 5.209836 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002006 | Grad Max: 0.005592 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009363 | Grad Max: 0.009363 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006749 | Grad Max: 1.008286 -> Layer: exit2_layers.0.bias | Grad Mean: 0.125143 | Grad Max: 5.641489 
-> Layer: exit2_layers.3.weight | Grad Mean: 0.000666 | Grad Max: 0.020895 -> Layer: exit2_layers.3.bias | Grad Mean: 0.070671 | Grad Max: 0.366014 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000095 | Grad Max: 0.000986 -> Layer: exit2_layers.6.bias | Grad Mean: 0.014341 | Grad Max: 0.027955 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000042 | Grad Max: 0.000773 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003817 | Grad Max: 0.011849 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001591 | Grad Max: 0.004279 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050014 | Grad Max: 0.050014 [GRADIENT NORM TOTAL] 22.9888 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.142 | Max: 0.959 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5410217 0.4589783] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 676/1372 | C: 706/1342 [LOSS Ex1] A: 0.63025 | B: 0.60406 | C: 0.59508 [LOGITS Ex2 A] Mean Abs: 2.155 | Max: 5.743 [LOSS Ex2] A: 0.10266 | B: 0.29375 | C: 0.22097 ** [JOINT LOSS] ** : 0.815593 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004250 | Grad Max: 0.110176 -> Layer: shared_layers.0.bias | Grad Mean: 0.274328 | Grad Max: 1.155234 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002049 | Grad Max: 0.005808 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011570 | Grad Max: 0.011570 -> Layer: exit2_layers.0.weight | Grad Mean: 0.001923 | Grad Max: 0.265539 -> Layer: exit2_layers.0.bias | Grad Mean: 0.034682 | Grad Max: 1.465674 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000197 | Grad Max: 0.005152 -> Layer: exit2_layers.3.bias | Grad Mean: 0.020679 | Grad Max: 0.093007 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000029 | Grad Max: 0.000353 -> Layer: exit2_layers.6.bias | Grad Mean: 0.004337 | Grad Max: 0.009845 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000012 | Grad Max: 0.000235 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.001136 | Grad Max: 0.003302 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000475 | Grad Max: 0.001912 -> Layer: exit2_layers.12.bias | Grad Mean: 0.014468 | Grad Max: 0.014468 [GRADIENT NORM TOTAL] 6.0497 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.207 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8328265 0.16717349] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 665/1383 | C: 653/1395 [LOSS Ex1] A: 0.62354 | B: 0.60430 | C: 0.60497 [LOGITS Ex2 A] Mean Abs: 2.098 | Max: 8.174 [LOSS Ex2] A: 0.09703 | B: 0.32433 | C: 0.20397 ** [JOINT LOSS] ** : 0.819378 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006866 | Grad Max: 0.343116 -> Layer: shared_layers.0.bias | Grad Mean: 0.865469 | Grad Max: 4.591135 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005275 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000942 | Grad Max: 0.000942 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005267 | Grad Max: 1.021369 -> Layer: exit2_layers.0.bias | Grad Mean: 0.098702 | Grad Max: 5.628523 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.015962 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055446 | Grad Max: 0.283971 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000732 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011034 | Grad Max: 0.022032 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000650 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002891 | Grad Max: 0.009746 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001219 | Grad Max: 0.003573 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038200 | Grad Max: 0.038200 [GRADIENT NORM TOTAL] 19.2770 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.151 | Max: 1.342 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010339 0.49896613] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 674/1374 | C: 719/1329 [LOSS Ex1] A: 0.63088 | B: 0.59978 | C: 0.60187 [LOGITS Ex2 A] Mean Abs: 2.072 | Max: 5.962 [LOSS Ex2] A: 0.10446 | B: 0.34119 | C: 0.22165 ** [JOINT LOSS] ** : 0.833278 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.014789 | Grad Max: 0.439451 -> Layer: shared_layers.0.bias | Grad Mean: 1.276376 | Grad Max: 5.837375 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.004999 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000715 | Grad Max: 0.000715 -> Layer: exit2_layers.0.weight | Grad Mean: 0.008127 | Grad Max: 1.157853 -> Layer: exit2_layers.0.bias | Grad Mean: 0.151024 | Grad Max: 6.398695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000830 | Grad Max: 0.023440 -> Layer: exit2_layers.3.bias | Grad Mean: 0.088756 | Grad Max: 0.438844 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000120 | Grad Max: 0.001290 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017996 | Grad Max: 0.035083 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001076 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004738 | Grad Max: 0.015818 -> Layer: exit2_layers.12.weight | Grad Mean: 0.002000 | Grad Max: 0.004744 -> Layer: exit2_layers.12.bias | Grad Mean: 0.062002 | Grad Max: 0.062002 [GRADIENT NORM TOTAL] 27.2938 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.944 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75549066 0.24450935] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 626/1230 | C: 774/1274 [LOSS Ex1] A: 0.62576 | B: 0.60372 | C: 0.59449 [LOGITS Ex2 A] Mean Abs: 2.044 | Max: 6.486 [LOSS Ex2] A: 0.12818 | B: 0.34887 | C: 0.22770 ** [JOINT LOSS] ** : 
0.842904 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.015264 | Grad Max: 0.424163 -> Layer: shared_layers.0.bias | Grad Mean: 1.253330 | Grad Max: 5.622044 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002172 | Grad Max: 0.005281 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000437 | Grad Max: 0.000437 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007937 | Grad Max: 1.155499 -> Layer: exit2_layers.0.bias | Grad Mean: 0.147581 | Grad Max: 6.366715 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000821 | Grad Max: 0.024125 -> Layer: exit2_layers.3.bias | Grad Mean: 0.087292 | Grad Max: 0.428988 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000119 | Grad Max: 0.001286 -> Layer: exit2_layers.6.bias | Grad Mean: 0.017647 | Grad Max: 0.036735 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000052 | Grad Max: 0.001010 -> Layer: exit2_layers.9.bias | Grad Mean: 0.004645 | Grad Max: 0.015039 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001998 | Grad Max: 0.004693 -> Layer: exit2_layers.12.bias | Grad Mean: 0.061202 | Grad Max: 0.061202 [GRADIENT NORM TOTAL] 26.4208 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.100 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64820623 0.35179377] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 589/1027 | B: 676/1372 | C: 755/1293 [LOSS Ex1] A: 0.62385 | B: 0.60400 | C: 0.58828 [LOGITS Ex2 A] Mean Abs: 2.169 | Max: 10.493 [LOSS Ex2] A: 0.10490 | B: 0.32351 | C: 0.19386 ** [JOINT LOSS] ** : 0.812794 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008289 | Grad Max: 0.261143 -> Layer: shared_layers.0.bias | Grad Mean: 0.737776 | Grad Max: 3.470728 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002171 | Grad Max: 0.005817 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006050 | Grad Max: 0.006050 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004660 | 
Grad Max: 0.667757 -> Layer: exit2_layers.0.bias | Grad Mean: 0.086252 | Grad Max: 3.681400 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000480 | Grad Max: 0.014508 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051287 | Grad Max: 0.256809 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000787 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010392 | Grad Max: 0.021093 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000629 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002715 | Grad Max: 0.009586 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001166 | Grad Max: 0.003540 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035673 | Grad Max: 0.035673 [GRADIENT NORM TOTAL] 15.6769 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.343 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5084822 0.49151775] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 665/1383 | C: 746/1302 [LOSS Ex1] A: 0.62463 | B: 0.60423 | C: 0.59744 [LOGITS Ex2 A] Mean Abs: 2.210 | Max: 10.267 [LOSS Ex2] A: 0.09810 | B: 0.30835 | C: 0.21083 ** [JOINT LOSS] ** : 0.814525 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006288 | Grad Max: 0.208296 -> Layer: shared_layers.0.bias | Grad Mean: 0.572291 | Grad Max: 2.733333 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005842 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000597 | Grad Max: 0.000597 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003602 | Grad Max: 0.675523 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066386 | Grad Max: 3.778288 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000326 | Grad Max: 0.010676 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034805 | Grad Max: 0.193968 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000571 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007099 | Grad Max: 
0.014458 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000408 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001900 | Grad Max: 0.006260 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000748 | Grad Max: 0.002647 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024381 | Grad Max: 0.024381 [GRADIENT NORM TOTAL] 13.2643 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.257 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037917 0.49620828] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 674/1374 | C: 719/1329 [LOSS Ex1] A: 0.62130 | B: 0.59972 | C: 0.60188 [LOGITS Ex2 A] Mean Abs: 2.243 | Max: 6.481 [LOSS Ex2] A: 0.10897 | B: 0.31553 | C: 0.24473 ** [JOINT LOSS] ** : 0.830712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.012194 | Grad Max: 0.437807 -> Layer: shared_layers.0.bias | Grad Mean: 1.175393 | Grad Max: 5.886418 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.005743 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007046 | Grad Max: 0.007046 -> Layer: exit2_layers.0.weight | Grad Mean: 0.007387 | Grad Max: 1.249426 -> Layer: exit2_layers.0.bias | Grad Mean: 0.138085 | Grad Max: 6.945692 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000698 | Grad Max: 0.021153 -> Layer: exit2_layers.3.bias | Grad Mean: 0.074975 | Grad Max: 0.385964 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000100 | Grad Max: 0.001112 -> Layer: exit2_layers.6.bias | Grad Mean: 0.015049 | Grad Max: 0.030175 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000043 | Grad Max: 0.000838 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003945 | Grad Max: 0.013273 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001615 | Grad Max: 0.004017 -> Layer: exit2_layers.12.bias | Grad Mean: 0.050869 | Grad Max: 0.050869 [GRADIENT NORM TOTAL] 26.1125 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.153 | Max: 1.296 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51224107 0.48775893] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 626/1230 | C: 713/1335 [LOSS Ex1] A: 0.62024 | B: 0.60365 | C: 0.60253 [LOGITS Ex2 A] Mean Abs: 2.210 | Max: 7.454 [LOSS Ex2] A: 0.12129 | B: 0.31529 | C: 0.23862 ** [JOINT LOSS] ** : 0.833870 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011654 | Grad Max: 0.373325 -> Layer: shared_layers.0.bias | Grad Mean: 1.038238 | Grad Max: 5.151225 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002117 | Grad Max: 0.005741 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000863 | Grad Max: 0.000863 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006463 | Grad Max: 1.179326 -> Layer: exit2_layers.0.bias | Grad Mean: 0.120214 | Grad Max: 6.546393 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000632 | Grad Max: 0.019247 -> Layer: exit2_layers.3.bias | Grad Mean: 0.067839 | Grad Max: 0.344869 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000923 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013626 | Grad Max: 0.026547 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000743 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003584 | Grad Max: 0.011475 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001474 | Grad Max: 0.003978 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046171 | Grad Max: 0.046171 [GRADIENT NORM TOTAL] 22.9715 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.072 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040674 0.49959326] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 676/1372 | C: 728/1320 [LOSS Ex1] A: 0.62927 | B: 0.60394 | C: 0.60376 [LOGITS Ex2 A] Mean 
Abs: 2.151 | Max: 5.053 [LOSS Ex2] A: 0.10295 | B: 0.30788 | C: 0.22525 ** [JOINT LOSS] ** : 0.824350 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006617 | Grad Max: 0.223704 -> Layer: shared_layers.0.bias | Grad Mean: 0.552977 | Grad Max: 2.729132 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002020 | Grad Max: 0.005230 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000611 | Grad Max: 0.000611 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003421 | Grad Max: 0.431713 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064203 | Grad Max: 2.408695 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000352 | Grad Max: 0.010534 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037502 | Grad Max: 0.187746 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000550 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007374 | Grad Max: 0.014775 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000425 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001947 | Grad Max: 0.006431 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000806 | Grad Max: 0.002721 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025796 | Grad Max: 0.025796 [GRADIENT NORM TOTAL] 11.7462 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409509 0.45904914] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 665/1383 | C: 474/902 [LOSS Ex1] A: 0.63015 | B: 0.60417 | C: 0.59800 [LOGITS Ex2 A] Mean Abs: 2.068 | Max: 5.463 [LOSS Ex2] A: 0.10427 | B: 0.31660 | C: 0.19921 ** [JOINT LOSS] ** : 0.817461 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007959 | Grad Max: 0.201843 -> Layer: shared_layers.0.bias | Grad Mean: 0.598077 | Grad Max: 2.708833 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002085 | Grad Max: 0.005533 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.008923 | Grad Max: 0.008923 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003854 | Grad Max: 0.765684 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070968 | Grad Max: 4.276421 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000376 | Grad Max: 0.011868 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039793 | Grad Max: 0.195335 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000055 | Grad Max: 0.000622 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008250 | Grad Max: 0.016679 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000486 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002204 | Grad Max: 0.007579 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000980 | Grad Max: 0.003380 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029630 | Grad Max: 0.029630 [GRADIENT NORM TOTAL] 13.3572 [EPOCH SUMMARY] Train Loss: 0.8242 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8153 | Alpha: 0.5500 No improve count: 8/15 ############################## EPOCH 182/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318764 0.16681242] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 674/1374 | C: 729/1319 [LOSS Ex1] A: 0.62344 | B: 0.59966 | C: 0.60014 [LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.796 [LOSS Ex2] A: 0.09502 | B: 0.31801 | C: 0.21735 ** [JOINT LOSS] ** : 0.817877 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009238 | Grad Max: 0.328264 -> Layer: shared_layers.0.bias | Grad Mean: 0.918675 | Grad Max: 4.423221 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002130 | Grad Max: 0.005750 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000387 | Grad Max: 0.000387 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005689 | Grad Max: 1.037361 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.106070 | Grad Max: 5.735444 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000566 | Grad Max: 0.017584 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061050 | Grad Max: 0.311344 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000891 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012377 | Grad Max: 0.024518 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000710 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003273 | Grad Max: 0.011049 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001386 | Grad Max: 0.003822 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042882 | Grad Max: 0.042882 [GRADIENT NORM TOTAL] 20.1041 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010789 0.49892107] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 626/1230 | C: 712/1336 [LOSS Ex1] A: 0.63079 | B: 0.60360 | C: 0.59879 [LOGITS Ex2 A] Mean Abs: 2.119 | Max: 6.312 [LOSS Ex2] A: 0.09738 | B: 0.33013 | C: 0.20975 ** [JOINT LOSS] ** : 0.823484 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009656 | Grad Max: 0.319118 -> Layer: shared_layers.0.bias | Grad Mean: 0.954306 | Grad Max: 4.329672 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002036 | Grad Max: 0.005557 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002628 | Grad Max: 0.002628 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005919 | Grad Max: 0.947362 -> Layer: exit2_layers.0.bias | Grad Mean: 0.109781 | Grad Max: 5.246794 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000596 | Grad Max: 0.018699 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063916 | Grad Max: 0.325148 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000907 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012848 | Grad Max: 0.026566 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000700 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003360 | Grad Max: 0.011308 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001391 | Grad Max: 0.003764 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042972 | Grad Max: 0.042972 [GRADIENT NORM TOTAL] 20.5628 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557117 0.24428837] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 725/1323 [LOSS Ex1] A: 0.62568 | B: 0.60391 | C: 0.59543 [LOGITS Ex2 A] Mean Abs: 2.078 | Max: 6.209 [LOSS Ex2] A: 0.11882 | B: 0.34099 | C: 0.20717 ** [JOINT LOSS] ** : 0.830667 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011517 | Grad Max: 0.355626 -> Layer: shared_layers.0.bias | Grad Mean: 1.023392 | Grad Max: 4.862053 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002134 | Grad Max: 0.005331 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004903 | Grad Max: 0.004903 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006347 | Grad Max: 1.078242 -> Layer: exit2_layers.0.bias | Grad Mean: 0.118241 | Grad Max: 5.969844 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.019100 -> Layer: exit2_layers.3.bias | Grad Mean: 0.067553 | Grad Max: 0.353090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.000975 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013604 | Grad Max: 0.026864 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000768 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003589 | Grad Max: 0.011984 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001513 | Grad Max: 0.003931 -> Layer: exit2_layers.12.bias | Grad Mean: 0.046795 | Grad Max: 0.046795 [GRADIENT NORM TOTAL] 22.3638 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([1616, 
32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482954 0.3517046] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 665/1383 | C: 766/1282 [LOSS Ex1] A: 0.62377 | B: 0.60415 | C: 0.59766 [LOGITS Ex2 A] Mean Abs: 2.111 | Max: 9.845 [LOSS Ex2] A: 0.10910 | B: 0.34055 | C: 0.21249 ** [JOINT LOSS] ** : 0.829243 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011286 | Grad Max: 0.358085 -> Layer: shared_layers.0.bias | Grad Mean: 1.018807 | Grad Max: 4.721560 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005758 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011130 | Grad Max: 0.011130 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006296 | Grad Max: 1.036957 -> Layer: exit2_layers.0.bias | Grad Mean: 0.117389 | Grad Max: 5.750445 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000631 | Grad Max: 0.017356 -> Layer: exit2_layers.3.bias | Grad Mean: 0.067817 | Grad Max: 0.318531 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000091 | Grad Max: 0.001014 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013719 | Grad Max: 0.026741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000040 | Grad Max: 0.000830 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003633 | Grad Max: 0.012140 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001582 | Grad Max: 0.004081 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048661 | Grad Max: 0.048661 [GRADIENT NORM TOTAL] 22.0855 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085099 0.4914901] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 674/1374 | C: 721/1327 [LOSS Ex1] A: 0.62457 | B: 0.59966 | C: 0.59627 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 8.222 
[LOSS Ex2] A: 0.10138 | B: 0.31365 | C: 0.22196 ** [JOINT LOSS] ** : 0.819165 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010435 | Grad Max: 0.318584 -> Layer: shared_layers.0.bias | Grad Mean: 0.916278 | Grad Max: 4.278434 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005338 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001219 | Grad Max: 0.001219 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005772 | Grad Max: 0.947616 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107413 | Grad Max: 5.256381 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.017105 -> Layer: exit2_layers.3.bias | Grad Mean: 0.062114 | Grad Max: 0.298469 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000920 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012585 | Grad Max: 0.024262 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000724 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003311 | Grad Max: 0.011043 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001422 | Grad Max: 0.003908 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043452 | Grad Max: 0.043452 [GRADIENT NORM TOTAL] 19.9367 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037942 0.49620578] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 626/1230 | C: 760/1288 [LOSS Ex1] A: 0.62126 | B: 0.60360 | C: 0.59082 [LOGITS Ex2 A] Mean Abs: 2.111 | Max: 7.276 [LOSS Ex2] A: 0.10507 | B: 0.32218 | C: 0.20565 ** [JOINT LOSS] ** : 0.816194 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009025 | Grad Max: 0.312053 -> Layer: shared_layers.0.bias | Grad Mean: 0.871481 | Grad Max: 4.107564 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002292 | Grad Max: 0.006247 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008052 | Grad Max: 
0.008052 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005407 | Grad Max: 0.914251 -> Layer: exit2_layers.0.bias | Grad Mean: 0.100590 | Grad Max: 5.046986 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000531 | Grad Max: 0.014918 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057300 | Grad Max: 0.286131 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000849 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011521 | Grad Max: 0.022789 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000668 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003007 | Grad Max: 0.010157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001251 | Grad Max: 0.003456 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038673 | Grad Max: 0.038673 [GRADIENT NORM TOTAL] 18.9663 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122487 0.4877513] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 676/1372 | C: 739/1309 [LOSS Ex1] A: 0.62021 | B: 0.60391 | C: 0.59352 [LOGITS Ex2 A] Mean Abs: 2.074 | Max: 5.524 [LOSS Ex2] A: 0.11306 | B: 0.34876 | C: 0.21497 ** [JOINT LOSS] ** : 0.831477 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009067 | Grad Max: 0.375805 -> Layer: shared_layers.0.bias | Grad Mean: 0.995166 | Grad Max: 4.933049 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002218 | Grad Max: 0.005760 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002888 | Grad Max: 0.002888 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006170 | Grad Max: 1.024768 -> Layer: exit2_layers.0.bias | Grad Mean: 0.115370 | Grad Max: 5.675250 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000607 | Grad Max: 0.019684 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065863 | Grad Max: 0.339035 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000890 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.013216 | Grad Max: 0.025752 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000755 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003535 | Grad Max: 0.011514 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001502 | Grad Max: 0.003905 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047425 | Grad Max: 0.047425 [GRADIENT NORM TOTAL] 21.8813 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004019 0.49959806] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 665/1383 | C: 736/1312 [LOSS Ex1] A: 0.62925 | B: 0.60415 | C: 0.60157 [LOGITS Ex2 A] Mean Abs: 2.053 | Max: 5.597 [LOSS Ex2] A: 0.10599 | B: 0.33440 | C: 0.21214 ** [JOINT LOSS] ** : 0.829170 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009965 | Grad Max: 0.349314 -> Layer: shared_layers.0.bias | Grad Mean: 0.980156 | Grad Max: 4.550669 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002001 | Grad Max: 0.005962 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009470 | Grad Max: 0.009470 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006099 | Grad Max: 1.020183 -> Layer: exit2_layers.0.bias | Grad Mean: 0.113485 | Grad Max: 5.648369 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000612 | Grad Max: 0.017375 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065767 | Grad Max: 0.329037 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000088 | Grad Max: 0.000966 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013337 | Grad Max: 0.027341 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000768 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003555 | Grad Max: 0.011665 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001552 | Grad Max: 0.004132 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047783 | Grad Max: 0.047783 [GRADIENT NORM 
TOTAL] 21.2617 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409568 0.4590432] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 674/1374 | C: 700/1348 [LOSS Ex1] A: 0.63014 | B: 0.59966 | C: 0.60361 [LOGITS Ex2 A] Mean Abs: 2.040 | Max: 5.775 [LOSS Ex2] A: 0.11307 | B: 0.32230 | C: 0.22074 ** [JOINT LOSS] ** : 0.829839 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009811 | Grad Max: 0.347796 -> Layer: shared_layers.0.bias | Grad Mean: 0.946211 | Grad Max: 4.524896 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002069 | Grad Max: 0.005533 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010028 | Grad Max: 0.010028 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005898 | Grad Max: 1.020629 -> Layer: exit2_layers.0.bias | Grad Mean: 0.109581 | Grad Max: 5.628964 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.017480 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063492 | Grad Max: 0.309484 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000910 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012827 | Grad Max: 0.025105 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000793 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003381 | Grad Max: 0.011985 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001432 | Grad Max: 0.003847 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044663 | Grad Max: 0.044663 [GRADIENT NORM TOTAL] 20.6961 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331878 0.16681226] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 626/1230 | C: 720/1328 [LOSS Ex1] A: 
0.62344 | B: 0.60360 | C: 0.59837 [LOGITS Ex2 A] Mean Abs: 2.083 | Max: 7.577 [LOSS Ex2] A: 0.10373 | B: 0.31781 | C: 0.20114 ** [JOINT LOSS] ** : 0.816032 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008428 | Grad Max: 0.328984 -> Layer: shared_layers.0.bias | Grad Mean: 0.900294 | Grad Max: 4.416506 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005402 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004483 | Grad Max: 0.004483 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005546 | Grad Max: 0.935756 -> Layer: exit2_layers.0.bias | Grad Mean: 0.103707 | Grad Max: 5.182055 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000550 | Grad Max: 0.015305 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059726 | Grad Max: 0.292637 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000788 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011979 | Grad Max: 0.023295 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000667 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003127 | Grad Max: 0.010713 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001274 | Grad Max: 0.003580 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040132 | Grad Max: 0.040132 [GRADIENT NORM TOTAL] 19.7080 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.501079 0.498921] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 723/1325 [LOSS Ex1] A: 0.63079 | B: 0.60391 | C: 0.60271 [LOGITS Ex2 A] Mean Abs: 2.098 | Max: 6.690 [LOSS Ex2] A: 0.09511 | B: 0.34616 | C: 0.22206 ** [JOINT LOSS] ** : 0.833579 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010202 | Grad Max: 0.356772 -> Layer: shared_layers.0.bias | Grad Mean: 0.983559 | Grad Max: 4.595041 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001994 | Grad 
Max: 0.005807 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001700 | Grad Max: 0.001700 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006098 | Grad Max: 0.915612 -> Layer: exit2_layers.0.bias | Grad Mean: 0.113111 | Grad Max: 5.116548 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000608 | Grad Max: 0.019124 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065711 | Grad Max: 0.330930 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000898 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013364 | Grad Max: 0.025855 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000766 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003585 | Grad Max: 0.012157 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001525 | Grad Max: 0.003989 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047915 | Grad Max: 0.047915 [GRADIENT NORM TOTAL] 21.1086 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557115 0.2442885] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 665/1383 | C: 715/1333 [LOSS Ex1] A: 0.62568 | B: 0.60415 | C: 0.60103 [LOGITS Ex2 A] Mean Abs: 2.067 | Max: 6.967 [LOSS Ex2] A: 0.11569 | B: 0.33727 | C: 0.22225 ** [JOINT LOSS] ** : 0.835356 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011395 | Grad Max: 0.336011 -> Layer: shared_layers.0.bias | Grad Mean: 0.978288 | Grad Max: 4.492485 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002059 | Grad Max: 0.005567 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005836 | Grad Max: 0.005836 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006125 | Grad Max: 0.872082 -> Layer: exit2_layers.0.bias | Grad Mean: 0.113528 | Grad Max: 4.914037 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000624 | Grad Max: 0.021101 -> Layer: exit2_layers.3.bias | Grad Mean: 0.066747 | Grad Max: 0.359656 
-> Layer: exit2_layers.6.weight | Grad Mean: 0.000090 | Grad Max: 0.001020 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013519 | Grad Max: 0.026197 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000841 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003564 | Grad Max: 0.011954 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001542 | Grad Max: 0.004162 -> Layer: exit2_layers.12.bias | Grad Mean: 0.047386 | Grad Max: 0.047386 [GRADIENT NORM TOTAL] 20.8331 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482948 0.35170516] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 674/1374 | C: 716/1332 [LOSS Ex1] A: 0.62377 | B: 0.59966 | C: 0.60198 [LOGITS Ex2 A] Mean Abs: 2.112 | Max: 8.367 [LOSS Ex2] A: 0.10073 | B: 0.31615 | C: 0.20904 ** [JOINT LOSS] ** : 0.817113 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010066 | Grad Max: 0.308824 -> Layer: shared_layers.0.bias | Grad Mean: 0.888428 | Grad Max: 4.137380 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002103 | Grad Max: 0.005127 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003365 | Grad Max: 0.003365 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005545 | Grad Max: 0.812196 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102532 | Grad Max: 4.601460 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.016639 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059425 | Grad Max: 0.288355 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000912 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012101 | Grad Max: 0.025264 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000695 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003185 | Grad Max: 0.011023 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003907 -> Layer: 
exit2_layers.12.bias | Grad Mean: 0.041584 | Grad Max: 0.041584 [GRADIENT NORM TOTAL] 19.0164 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085098 0.49149016] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 474/902 [LOSS Ex1] A: 0.62457 | B: 0.60360 | C: 0.60697 [LOGITS Ex2 A] Mean Abs: 2.110 | Max: 8.889 [LOSS Ex2] A: 0.09943 | B: 0.32316 | C: 0.23344 ** [JOINT LOSS] ** : 0.830394 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009674 | Grad Max: 0.309412 -> Layer: shared_layers.0.bias | Grad Mean: 0.880932 | Grad Max: 4.032309 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.005147 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002907 | Grad Max: 0.002907 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005584 | Grad Max: 0.737628 -> Layer: exit2_layers.0.bias | Grad Mean: 0.103539 | Grad Max: 4.133241 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000572 | Grad Max: 0.018245 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061285 | Grad Max: 0.313257 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000082 | Grad Max: 0.000909 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012459 | Grad Max: 0.024784 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000687 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003309 | Grad Max: 0.010788 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001439 | Grad Max: 0.003741 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044072 | Grad Max: 0.044072 [GRADIENT NORM TOTAL] 18.7380 [EPOCH SUMMARY] Train Loss: 0.8257 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8139 | Alpha: 0.5500 No improve count: 9/15 ############################## EPOCH 183/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037937 0.49620625] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 676/1372 | C: 715/1333 [LOSS Ex1] A: 0.62126 | B: 0.60391 | C: 0.59680 [LOGITS Ex2 A] Mean Abs: 2.113 | Max: 7.228 [LOSS Ex2] A: 0.10723 | B: 0.34330 | C: 0.20203 ** [JOINT LOSS] ** : 0.824839 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009334 | Grad Max: 0.332464 -> Layer: shared_layers.0.bias | Grad Mean: 0.938294 | Grad Max: 4.528007 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.006025 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000711 | Grad Max: 0.000711 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005775 | Grad Max: 0.839907 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107061 | Grad Max: 4.705668 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000562 | Grad Max: 0.016851 -> Layer: exit2_layers.3.bias | Grad Mean: 0.060957 | Grad Max: 0.301503 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000920 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012342 | Grad Max: 0.023988 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000036 | Grad Max: 0.000704 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003260 | Grad Max: 0.010736 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003755 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042467 | Grad Max: 0.042467 [GRADIENT NORM TOTAL] 20.3454 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51224893 0.4877511 ] | 
Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 750/1298 [LOSS Ex1] A: 0.62021 | B: 0.60415 | C: 0.59510 [LOGITS Ex2 A] Mean Abs: 2.069 | Max: 6.179 [LOSS Ex2] A: 0.11583 | B: 0.33434 | C: 0.21492 ** [JOINT LOSS] ** : 0.828186 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010901 | Grad Max: 0.333725 -> Layer: shared_layers.0.bias | Grad Mean: 0.966557 | Grad Max: 4.477579 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002165 | Grad Max: 0.006035 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002211 | Grad Max: 0.002211 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005926 | Grad Max: 0.819668 -> Layer: exit2_layers.0.bias | Grad Mean: 0.110791 | Grad Max: 4.579196 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000603 | Grad Max: 0.017248 -> Layer: exit2_layers.3.bias | Grad Mean: 0.064529 | Grad Max: 0.322295 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000087 | Grad Max: 0.000898 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012978 | Grad Max: 0.025532 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000038 | Grad Max: 0.000770 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003399 | Grad Max: 0.011905 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001469 | Grad Max: 0.004076 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044803 | Grad Max: 0.044803 [GRADIENT NORM TOTAL] 20.4285 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.500402 0.499598] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 745/1303 [LOSS Ex1] A: 0.62925 | B: 0.59966 | C: 0.59992 [LOGITS Ex2 A] Mean Abs: 2.067 | Max: 6.821 [LOSS Ex2] A: 0.10496 | B: 0.31074 | C: 0.22393 ** [JOINT LOSS] ** : 0.822823 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009373 | Grad Max: 
0.306000 -> Layer: shared_layers.0.bias | Grad Mean: 0.875792 | Grad Max: 4.080841 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005388 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001553 | Grad Max: 0.001553 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005526 | Grad Max: 0.851832 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102439 | Grad Max: 4.810162 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000552 | Grad Max: 0.016503 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059435 | Grad Max: 0.309648 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000907 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012044 | Grad Max: 0.024258 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000715 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003143 | Grad Max: 0.010951 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001333 | Grad Max: 0.003885 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040906 | Grad Max: 0.040906 [GRADIENT NORM TOTAL] 18.9220 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.540956 0.45904404] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 695/1353 [LOSS Ex1] A: 0.63014 | B: 0.60360 | C: 0.60423 [LOGITS Ex2 A] Mean Abs: 2.035 | Max: 5.606 [LOSS Ex2] A: 0.11250 | B: 0.32111 | C: 0.21203 ** [JOINT LOSS] ** : 0.827867 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010285 | Grad Max: 0.310242 -> Layer: shared_layers.0.bias | Grad Mean: 0.919015 | Grad Max: 4.116593 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002024 | Grad Max: 0.005194 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005020 | Grad Max: 0.005020 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005744 | Grad Max: 0.756661 -> Layer: exit2_layers.0.bias | Grad Mean: 0.106334 | Grad Max: 4.267666 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000593 | Grad Max: 0.016937 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063877 | Grad Max: 0.307721 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000085 | Grad Max: 0.000957 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012962 | Grad Max: 0.026166 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000778 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003417 | Grad Max: 0.012093 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001425 | Grad Max: 0.004065 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044484 | Grad Max: 0.044484 [GRADIENT NORM TOTAL] 19.4332 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318764 0.16681238] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 743/1305 [LOSS Ex1] A: 0.62344 | B: 0.60391 | C: 0.59136 [LOGITS Ex2 A] Mean Abs: 2.096 | Max: 8.004 [LOSS Ex2] A: 0.09998 | B: 0.34010 | C: 0.20492 ** [JOINT LOSS] ** : 0.821238 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008161 | Grad Max: 0.322716 -> Layer: shared_layers.0.bias | Grad Mean: 0.892944 | Grad Max: 4.386117 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002179 | Grad Max: 0.005812 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002130 | Grad Max: 0.002130 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.811661 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102479 | Grad Max: 4.536824 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000532 | Grad Max: 0.016609 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057735 | Grad Max: 0.304283 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000758 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011627 | Grad Max: 0.022511 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000698 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.003087 | Grad Max: 0.010440 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001329 | Grad Max: 0.003668 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041215 | Grad Max: 0.041215 [GRADIENT NORM TOTAL] 19.4958 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107914 0.49892083] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 726/1322 [LOSS Ex1] A: 0.63079 | B: 0.60415 | C: 0.60011 [LOGITS Ex2 A] Mean Abs: 2.092 | Max: 6.269 [LOSS Ex2] A: 0.09521 | B: 0.32931 | C: 0.20837 ** [JOINT LOSS] ** : 0.822647 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009563 | Grad Max: 0.310565 -> Layer: shared_layers.0.bias | Grad Mean: 0.876339 | Grad Max: 4.038787 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002003 | Grad Max: 0.004859 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002812 | Grad Max: 0.002812 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005472 | Grad Max: 0.797187 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101483 | Grad Max: 4.473042 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000551 | Grad Max: 0.016415 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059399 | Grad Max: 0.296855 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000079 | Grad Max: 0.000846 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012066 | Grad Max: 0.023551 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000717 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003188 | Grad Max: 0.011172 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001360 | Grad Max: 0.003991 -> Layer: exit2_layers.12.bias | Grad Mean: 0.042233 | Grad Max: 0.042233 [GRADIENT NORM TOTAL] 18.7568 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.755711 0.24428894] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 724/1324 [LOSS Ex1] A: 0.62568 | B: 0.59966 | C: 0.59840 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 7.110 [LOSS Ex2] A: 0.11372 | B: 0.31499 | C: 0.21961 ** [JOINT LOSS] ** : 0.824020 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009756 | Grad Max: 0.310434 -> Layer: shared_layers.0.bias | Grad Mean: 0.878269 | Grad Max: 4.071934 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.005580 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002244 | Grad Max: 0.002244 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005530 | Grad Max: 0.815279 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102557 | Grad Max: 4.572691 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000552 | Grad Max: 0.016188 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059350 | Grad Max: 0.292347 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000080 | Grad Max: 0.000862 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012019 | Grad Max: 0.024486 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000720 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003171 | Grad Max: 0.010825 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001363 | Grad Max: 0.003825 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041990 | Grad Max: 0.041990 [GRADIENT NORM TOTAL] 18.9368 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64829385 0.35170615] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 626/1230 | C: 741/1307 [LOSS Ex1] A: 0.62377 | B: 0.60360 | C: 0.59688 [LOGITS Ex2 A] Mean Abs: 2.113 | Max: 8.321 [LOSS Ex2] A: 0.10375 | B: 0.32180 | C: 0.19963 ** [JOINT LOSS] ** : 
0.816480 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010277 | Grad Max: 0.292904 -> Layer: shared_layers.0.bias | Grad Mean: 0.897750 | Grad Max: 3.931839 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002094 | Grad Max: 0.006046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002853 | Grad Max: 0.002853 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005605 | Grad Max: 0.822566 -> Layer: exit2_layers.0.bias | Grad Mean: 0.104406 | Grad Max: 4.623491 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000580 | Grad Max: 0.018044 -> Layer: exit2_layers.3.bias | Grad Mean: 0.062167 | Grad Max: 0.316162 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000937 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012557 | Grad Max: 0.024992 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000739 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003267 | Grad Max: 0.011143 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003798 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041569 | Grad Max: 0.041569 [GRADIENT NORM TOTAL] 19.1422 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085097 0.4914903] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 718/1330 [LOSS Ex1] A: 0.62457 | B: 0.60391 | C: 0.59968 [LOGITS Ex2 A] Mean Abs: 2.116 | Max: 8.602 [LOSS Ex2] A: 0.09834 | B: 0.34199 | C: 0.18662 ** [JOINT LOSS] ** : 0.818368 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010389 | Grad Max: 0.324354 -> Layer: shared_layers.0.bias | Grad Mean: 0.921837 | Grad Max: 4.395842 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002086 | Grad Max: 0.005441 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001594 | Grad Max: 0.001594 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005794 | Grad 
Max: 0.868114 -> Layer: exit2_layers.0.bias | Grad Mean: 0.107543 | Grad Max: 4.874303 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000578 | Grad Max: 0.017294 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061953 | Grad Max: 0.307217 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000949 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012583 | Grad Max: 0.024566 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000693 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003316 | Grad Max: 0.011122 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001413 | Grad Max: 0.003949 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043667 | Grad Max: 0.043667 [GRADIENT NORM TOTAL] 20.0238 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037931 0.4962069] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 744/1304 [LOSS Ex1] A: 0.62126 | B: 0.60415 | C: 0.59786 [LOGITS Ex2 A] Mean Abs: 2.104 | Max: 7.336 [LOSS Ex2] A: 0.10448 | B: 0.33614 | C: 0.23011 ** [JOINT LOSS] ** : 0.831332 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009094 | Grad Max: 0.321649 -> Layer: shared_layers.0.bias | Grad Mean: 0.882024 | Grad Max: 4.122788 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002125 | Grad Max: 0.005471 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001260 | Grad Max: 0.001260 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005469 | Grad Max: 0.873132 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101688 | Grad Max: 4.878801 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000539 | Grad Max: 0.017371 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058375 | Grad Max: 0.302425 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000792 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011873 | Grad Max: 0.022531 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000691 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003203 | Grad Max: 0.010242 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001405 | Grad Max: 0.003581 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043776 | Grad Max: 0.043776 [GRADIENT NORM TOTAL] 19.0630 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122492 0.4877508] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 747/1301 [LOSS Ex1] A: 0.62021 | B: 0.59966 | C: 0.59307 [LOGITS Ex2 A] Mean Abs: 2.066 | Max: 6.322 [LOSS Ex2] A: 0.11115 | B: 0.30592 | C: 0.19840 ** [JOINT LOSS] ** : 0.809473 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008144 | Grad Max: 0.302986 -> Layer: shared_layers.0.bias | Grad Mean: 0.860414 | Grad Max: 4.048113 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002234 | Grad Max: 0.005970 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002102 | Grad Max: 0.002102 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005298 | Grad Max: 0.800495 -> Layer: exit2_layers.0.bias | Grad Mean: 0.099345 | Grad Max: 4.470567 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000537 | Grad Max: 0.016462 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058317 | Grad Max: 0.299827 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000791 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011596 | Grad Max: 0.023170 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000723 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003039 | Grad Max: 0.010456 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001256 | Grad Max: 0.003923 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039164 | Grad Max: 0.039164 [GRADIENT NORM TOTAL] 18.5137 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004021 0.4995979] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 731/1317 [LOSS Ex1] A: 0.62925 | B: 0.60360 | C: 0.60335 [LOGITS Ex2 A] Mean Abs: 2.053 | Max: 5.645 [LOSS Ex2] A: 0.10408 | B: 0.31809 | C: 0.21458 ** [JOINT LOSS] ** : 0.824320 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.010114 | Grad Max: 0.302774 -> Layer: shared_layers.0.bias | Grad Mean: 0.876203 | Grad Max: 3.934770 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002019 | Grad Max: 0.005316 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004716 | Grad Max: 0.004716 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005553 | Grad Max: 0.780036 -> Layer: exit2_layers.0.bias | Grad Mean: 0.102846 | Grad Max: 4.394171 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000570 | Grad Max: 0.016756 -> Layer: exit2_layers.3.bias | Grad Mean: 0.061200 | Grad Max: 0.296207 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000083 | Grad Max: 0.000918 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012506 | Grad Max: 0.025189 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000734 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003296 | Grad Max: 0.011185 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001413 | Grad Max: 0.004109 -> Layer: exit2_layers.12.bias | Grad Mean: 0.043604 | Grad Max: 0.043604 [GRADIENT NORM TOTAL] 18.7068 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54095495 0.45904502] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 676/1372 | C: 677/1371 [LOSS Ex1] A: 0.63014 | B: 0.60391 | C: 0.60701 [LOGITS Ex2 A] Mean Abs: 
2.059 | Max: 5.764 [LOSS Ex2] A: 0.10968 | B: 0.33795 | C: 0.23534 ** [JOINT LOSS] ** : 0.841344 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009497 | Grad Max: 0.339891 -> Layer: shared_layers.0.bias | Grad Mean: 0.966417 | Grad Max: 4.681437 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001971 | Grad Max: 0.005532 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011534 | Grad Max: 0.011534 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005968 | Grad Max: 0.870153 -> Layer: exit2_layers.0.bias | Grad Mean: 0.110782 | Grad Max: 4.911767 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000589 | Grad Max: 0.019101 -> Layer: exit2_layers.3.bias | Grad Mean: 0.063528 | Grad Max: 0.324871 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000084 | Grad Max: 0.000843 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012811 | Grad Max: 0.025283 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000037 | Grad Max: 0.000693 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003403 | Grad Max: 0.011291 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001440 | Grad Max: 0.003745 -> Layer: exit2_layers.12.bias | Grad Mean: 0.044983 | Grad Max: 0.044983 [GRADIENT NORM TOTAL] 20.9693 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318746 0.16681258] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 665/1383 | C: 480/896 [LOSS Ex1] A: 0.62344 | B: 0.60415 | C: 0.60416 [LOGITS Ex2 A] Mean Abs: 2.106 | Max: 6.967 [LOSS Ex2] A: 0.09778 | B: 0.32644 | C: 0.26855 ** [JOINT LOSS] ** : 0.841509 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008264 | Grad Max: 0.317328 -> Layer: shared_layers.0.bias | Grad Mean: 0.859963 | Grad Max: 4.169960 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002089 | Grad Max: 0.005029 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.003040 | Grad Max: 0.003040 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005218 | Grad Max: 0.710026 -> Layer: exit2_layers.0.bias | Grad Mean: 0.097425 | Grad Max: 3.960751 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000524 | Grad Max: 0.015606 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056694 | Grad Max: 0.298656 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000764 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011452 | Grad Max: 0.021593 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000642 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003046 | Grad Max: 0.010069 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001301 | Grad Max: 0.003430 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040722 | Grad Max: 0.040722 [GRADIENT NORM TOTAL] 18.3270 [EPOCH SUMMARY] Train Loss: 0.8253 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8121 | Alpha: 0.5500 No improve count: 10/15 ############################## EPOCH 184/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010793 0.49892068] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 674/1374 | C: 744/1304 [LOSS Ex1] A: 0.63079 | B: 0.59966 | C: 0.60197 [LOGITS Ex2 A] Mean Abs: 2.099 | Max: 6.365 [LOSS Ex2] A: 0.09877 | B: 0.31336 | C: 0.21663 ** [JOINT LOSS] ** : 0.820393 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008991 | Grad Max: 0.287714 -> Layer: shared_layers.0.bias | Grad Mean: 0.792975 | Grad Max: 3.824040 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002021 | Grad Max: 0.004775 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001921 | Grad Max: 0.001921 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004940 | Grad Max: 0.833165 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.091369 | Grad Max: 4.652308 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000485 | Grad Max: 0.015184 -> Layer: exit2_layers.3.bias | Grad Mean: 0.052123 | Grad Max: 0.273914 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000070 | Grad Max: 0.000775 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010683 | Grad Max: 0.020408 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000031 | Grad Max: 0.000644 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002869 | Grad Max: 0.009540 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001247 | Grad Max: 0.003620 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038416 | Grad Max: 0.038416 [GRADIENT NORM TOTAL] 17.2122 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75571054 0.24428946] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 626/1230 | C: 763/1285 [LOSS Ex1] A: 0.62568 | B: 0.60360 | C: 0.59029 [LOGITS Ex2 A] Mean Abs: 2.076 | Max: 6.722 [LOSS Ex2] A: 0.11481 | B: 0.32394 | C: 0.21180 ** [JOINT LOSS] ** : 0.823375 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011050 | Grad Max: 0.298582 -> Layer: shared_layers.0.bias | Grad Mean: 0.882616 | Grad Max: 3.877453 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002131 | Grad Max: 0.005589 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006417 | Grad Max: 0.006417 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005433 | Grad Max: 0.713624 -> Layer: exit2_layers.0.bias | Grad Mean: 0.100776 | Grad Max: 4.016646 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000558 | Grad Max: 0.017291 -> Layer: exit2_layers.3.bias | Grad Mean: 0.059524 | Grad Max: 0.307763 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000081 | Grad Max: 0.000872 -> Layer: exit2_layers.6.bias | Grad Mean: 0.012068 | Grad Max: 0.023972 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000783 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003188 | Grad Max: 0.011156 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001357 | Grad Max: 0.003831 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041661 | Grad Max: 0.041661 [GRADIENT NORM TOTAL] 18.3890 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482926 0.35170737] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 676/1372 | C: 745/1303 [LOSS Ex1] A: 0.62377 | B: 0.60391 | C: 0.59754 [LOGITS Ex2 A] Mean Abs: 2.129 | Max: 8.737 [LOSS Ex2] A: 0.10785 | B: 0.34019 | C: 0.22295 ** [JOINT LOSS] ** : 0.832070 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.011178 | Grad Max: 0.338813 -> Layer: shared_layers.0.bias | Grad Mean: 0.964782 | Grad Max: 4.539521 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002082 | Grad Max: 0.005937 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011279 | Grad Max: 0.011279 -> Layer: exit2_layers.0.weight | Grad Mean: 0.006083 | Grad Max: 0.918166 -> Layer: exit2_layers.0.bias | Grad Mean: 0.112702 | Grad Max: 5.181048 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000615 | Grad Max: 0.018260 -> Layer: exit2_layers.3.bias | Grad Mean: 0.065721 | Grad Max: 0.324956 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000089 | Grad Max: 0.000944 -> Layer: exit2_layers.6.bias | Grad Mean: 0.013419 | Grad Max: 0.026101 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000039 | Grad Max: 0.000782 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003578 | Grad Max: 0.011257 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001561 | Grad Max: 0.003986 -> Layer: exit2_layers.12.bias | Grad Mean: 0.048001 | Grad Max: 0.048001 [GRADIENT NORM TOTAL] 20.7338 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085096 0.49149042] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 665/1383 | C: 705/1343 [LOSS Ex1] A: 0.62457 | B: 0.60415 | C: 0.59527 [LOGITS Ex2 A] Mean Abs: 2.125 | Max: 7.715 [LOSS Ex2] A: 0.09510 | B: 0.33043 | C: 0.22165 ** [JOINT LOSS] ** : 0.823726 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009874 | Grad Max: 0.293107 -> Layer: shared_layers.0.bias | Grad Mean: 0.870429 | Grad Max: 3.904039 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002087 | Grad Max: 0.005148 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005409 | Grad Max: 0.005409 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005377 | Grad Max: 0.739932 -> Layer: exit2_layers.0.bias | Grad Mean: 0.099758 | Grad Max: 4.137673 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000542 | Grad Max: 0.015974 -> Layer: exit2_layers.3.bias | Grad Mean: 0.058201 | Grad Max: 0.276935 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000078 | Grad Max: 0.000856 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011838 | Grad Max: 0.023370 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000035 | Grad Max: 0.000732 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003120 | Grad Max: 0.010771 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001367 | Grad Max: 0.003793 -> Layer: exit2_layers.12.bias | Grad Mean: 0.041595 | Grad Max: 0.041595 [GRADIENT NORM TOTAL] 18.3682 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037923 0.49620768] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 674/1374 | C: 703/1345 [LOSS Ex1] A: 0.62126 | B: 0.59966 | C: 0.60240 [LOGITS Ex2 A] Mean Abs: 2.116 | Max: 6.938 
[LOSS Ex2] A: 0.10311 | B: 0.30863 | C: 0.21883 ** [JOINT LOSS] ** : 0.817963 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009300 | Grad Max: 0.291152 -> Layer: shared_layers.0.bias | Grad Mean: 0.848170 | Grad Max: 3.869596 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002163 | Grad Max: 0.005615 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000503 | Grad Max: 0.000503 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005289 | Grad Max: 0.790862 -> Layer: exit2_layers.0.bias | Grad Mean: 0.098286 | Grad Max: 4.446589 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000535 | Grad Max: 0.015500 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057450 | Grad Max: 0.283179 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000077 | Grad Max: 0.000865 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011701 | Grad Max: 0.022888 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000764 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003075 | Grad Max: 0.010796 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001298 | Grad Max: 0.003892 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039861 | Grad Max: 0.039861 [GRADIENT NORM TOTAL] 18.1098 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122495 0.48775047] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 626/1230 | C: 709/1339 [LOSS Ex1] A: 0.62021 | B: 0.60360 | C: 0.59879 [LOGITS Ex2 A] Mean Abs: 2.079 | Max: 6.779 [LOSS Ex2] A: 0.11144 | B: 0.32241 | C: 0.20846 ** [JOINT LOSS] ** : 0.821636 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008192 | Grad Max: 0.285441 -> Layer: shared_layers.0.bias | Grad Mean: 0.829060 | Grad Max: 3.807547 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002246 | Grad Max: 0.006497 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006788 | Grad Max: 
0.006788 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005048 | Grad Max: 0.689899 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094538 | Grad Max: 3.864976 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000515 | Grad Max: 0.016663 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055458 | Grad Max: 0.291885 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000824 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011077 | Grad Max: 0.022496 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000697 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002896 | Grad Max: 0.009776 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001208 | Grad Max: 0.003376 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037374 | Grad Max: 0.037374 [GRADIENT NORM TOTAL] 17.6026 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040215 0.4995978 ] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 676/1372 | C: 714/1334 [LOSS Ex1] A: 0.62925 | B: 0.60391 | C: 0.59772 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 5.785 [LOSS Ex2] A: 0.10318 | B: 0.33035 | C: 0.19491 ** [JOINT LOSS] ** : 0.819774 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008546 | Grad Max: 0.312621 -> Layer: shared_layers.0.bias | Grad Mean: 0.871027 | Grad Max: 4.273620 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.005325 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002484 | Grad Max: 0.002484 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005442 | Grad Max: 0.820869 -> Layer: exit2_layers.0.bias | Grad Mean: 0.101493 | Grad Max: 4.609174 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000533 | Grad Max: 0.016019 -> Layer: exit2_layers.3.bias | Grad Mean: 0.057570 | Grad Max: 0.288280 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000766 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.011612 | Grad Max: 0.022363 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000034 | Grad Max: 0.000620 -> Layer: exit2_layers.9.bias | Grad Mean: 0.003094 | Grad Max: 0.009891 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001299 | Grad Max: 0.003713 -> Layer: exit2_layers.12.bias | Grad Mean: 0.040608 | Grad Max: 0.040608 [GRADIENT NORM TOTAL] 19.1444 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409538 0.45904616] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 665/1383 | C: 747/1301 [LOSS Ex1] A: 0.63014 | B: 0.60415 | C: 0.59534 [LOGITS Ex2 A] Mean Abs: 2.039 | Max: 6.163 [LOSS Ex2] A: 0.10896 | B: 0.32510 | C: 0.21258 ** [JOINT LOSS] ** : 0.825425 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008352 | Grad Max: 0.288658 -> Layer: shared_layers.0.bias | Grad Mean: 0.829275 | Grad Max: 3.973768 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002055 | Grad Max: 0.005275 -> Layer: exit1_layers.0.bias | Grad Mean: 0.008060 | Grad Max: 0.008060 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005094 | Grad Max: 0.806759 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095079 | Grad Max: 4.530701 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000508 | Grad Max: 0.015907 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054591 | Grad Max: 0.276326 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000731 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010918 | Grad Max: 0.021503 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000563 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002869 | Grad Max: 0.009161 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001231 | Grad Max: 0.003461 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037842 | Grad Max: 0.037842 [GRADIENT NORM 
TOTAL] 18.0823 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331871 0.16681288] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 674/1374 | C: 723/1325 [LOSS Ex1] A: 0.62344 | B: 0.59966 | C: 0.60588 [LOGITS Ex2 A] Mean Abs: 2.086 | Max: 6.437 [LOSS Ex2] A: 0.09875 | B: 0.30820 | C: 0.21895 ** [JOINT LOSS] ** : 0.818297 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008530 | Grad Max: 0.274018 -> Layer: shared_layers.0.bias | Grad Mean: 0.814689 | Grad Max: 3.695608 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002127 | Grad Max: 0.005475 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003419 | Grad Max: 0.003419 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005088 | Grad Max: 0.805335 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094988 | Grad Max: 4.501594 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.015873 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054908 | Grad Max: 0.278737 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000760 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011074 | Grad Max: 0.021765 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000651 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002915 | Grad Max: 0.009887 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001239 | Grad Max: 0.003661 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037881 | Grad Max: 0.037881 [GRADIENT NORM TOTAL] 17.7029 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107956 0.49892047] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 626/1230 | C: 729/1319 [LOSS Ex1] 
A: 0.63079 | B: 0.60360 | C: 0.59939 [LOGITS Ex2 A] Mean Abs: 2.095 | Max: 6.603 [LOSS Ex2] A: 0.09596 | B: 0.31453 | C: 0.21398 ** [JOINT LOSS] ** : 0.819419 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008460 | Grad Max: 0.290586 -> Layer: shared_layers.0.bias | Grad Mean: 0.808912 | Grad Max: 3.758618 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002048 | Grad Max: 0.005221 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001659 | Grad Max: 0.001659 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005052 | Grad Max: 0.753139 -> Layer: exit2_layers.0.bias | Grad Mean: 0.093306 | Grad Max: 4.218432 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000507 | Grad Max: 0.015307 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054727 | Grad Max: 0.271966 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000813 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011210 | Grad Max: 0.021969 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000681 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002964 | Grad Max: 0.010438 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001258 | Grad Max: 0.003798 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039194 | Grad Max: 0.039194 [GRADIENT NORM TOTAL] 17.3634 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557099 0.24429008] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 730/1318 [LOSS Ex1] A: 0.62568 | B: 0.60391 | C: 0.59883 [LOGITS Ex2 A] Mean Abs: 2.084 | Max: 6.462 [LOSS Ex2] A: 0.11144 | B: 0.33193 | C: 0.21491 ** [JOINT LOSS] ** : 0.828901 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008730 | Grad Max: 0.318571 -> Layer: shared_layers.0.bias | Grad Mean: 0.851008 | Grad Max: 4.273470 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002078 | 
Grad Max: 0.005786 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002484 | Grad Max: 0.002484 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005274 | Grad Max: 0.865488 -> Layer: exit2_layers.0.bias | Grad Mean: 0.097706 | Grad Max: 4.879166 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000509 | Grad Max: 0.015251 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054767 | Grad Max: 0.275155 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000073 | Grad Max: 0.000798 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011085 | Grad Max: 0.022131 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000612 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002942 | Grad Max: 0.009357 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001268 | Grad Max: 0.003508 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039440 | Grad Max: 0.039440 [GRADIENT NORM TOTAL] 18.5758 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482913 0.3517087] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 665/1383 | C: 751/1297 [LOSS Ex1] A: 0.62377 | B: 0.60415 | C: 0.59902 [LOGITS Ex2 A] Mean Abs: 2.141 | Max: 10.134 [LOSS Ex2] A: 0.10499 | B: 0.32478 | C: 0.21171 ** [JOINT LOSS] ** : 0.822808 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008795 | Grad Max: 0.281615 -> Layer: shared_layers.0.bias | Grad Mean: 0.828497 | Grad Max: 3.770032 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005310 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007929 | Grad Max: 0.007929 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005142 | Grad Max: 0.853388 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095711 | Grad Max: 4.786537 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000519 | Grad Max: 0.015904 -> Layer: exit2_layers.3.bias | Grad Mean: 0.055882 | Grad Max: 
0.281425 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000075 | Grad Max: 0.000776 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011304 | Grad Max: 0.022022 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000635 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002992 | Grad Max: 0.010270 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001277 | Grad Max: 0.003782 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039679 | Grad Max: 0.039679 [GRADIENT NORM TOTAL] 17.9835 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850946 0.49149057] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 674/1374 | C: 696/1352 [LOSS Ex1] A: 0.62457 | B: 0.59966 | C: 0.60203 [LOGITS Ex2 A] Mean Abs: 2.133 | Max: 9.391 [LOSS Ex2] A: 0.10218 | B: 0.30353 | C: 0.19787 ** [JOINT LOSS] ** : 0.809949 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007977 | Grad Max: 0.273017 -> Layer: shared_layers.0.bias | Grad Mean: 0.773398 | Grad Max: 3.735823 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.005354 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002590 | Grad Max: 0.002590 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004803 | Grad Max: 0.808509 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088995 | Grad Max: 4.536067 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000476 | Grad Max: 0.014705 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051467 | Grad Max: 0.267136 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000729 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010481 | Grad Max: 0.021184 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000637 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002751 | Grad Max: 0.009641 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001155 | Grad Max: 0.003579 
-> Layer: exit2_layers.12.bias | Grad Mean: 0.035716 | Grad Max: 0.035716 [GRADIENT NORM TOTAL] 16.9233 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037916 0.4962085] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 626/1230 | C: 477/899 [LOSS Ex1] A: 0.62126 | B: 0.60360 | C: 0.60310 [LOGITS Ex2 A] Mean Abs: 2.119 | Max: 6.424 [LOSS Ex2] A: 0.10003 | B: 0.31771 | C: 0.20624 ** [JOINT LOSS] ** : 0.817310 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007357 | Grad Max: 0.251871 -> Layer: shared_layers.0.bias | Grad Mean: 0.733685 | Grad Max: 3.456705 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002121 | Grad Max: 0.006051 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003288 | Grad Max: 0.003288 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004507 | Grad Max: 0.676047 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083655 | Grad Max: 3.797083 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000446 | Grad Max: 0.013856 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047874 | Grad Max: 0.241111 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000064 | Grad Max: 0.000711 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009601 | Grad Max: 0.020032 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000628 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002490 | Grad Max: 0.009334 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001061 | Grad Max: 0.003363 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032292 | Grad Max: 0.032292 [GRADIENT NORM TOTAL] 15.7055 [EPOCH SUMMARY] Train Loss: 0.8215 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8088 | Alpha: 0.5500 No improve count: 11/15 ############################## EPOCH 185/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122498 0.48775017] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 676/1372 | C: 719/1329 [LOSS Ex1] A: 0.62021 | B: 0.60391 | C: 0.60558 [LOGITS Ex2 A] Mean Abs: 2.094 | Max: 5.878 [LOSS Ex2] A: 0.10780 | B: 0.32386 | C: 0.23254 ** [JOINT LOSS] ** : 0.831298 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006808 | Grad Max: 0.312183 -> Layer: shared_layers.0.bias | Grad Mean: 0.790958 | Grad Max: 4.159384 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002054 | Grad Max: 0.005798 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002165 | Grad Max: 0.002165 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004799 | Grad Max: 0.760637 -> Layer: exit2_layers.0.bias | Grad Mean: 0.089465 | Grad Max: 4.256280 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.014577 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050402 | Grad Max: 0.255227 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000727 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010122 | Grad Max: 0.019404 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000593 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002715 | Grad Max: 0.008987 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001164 | Grad Max: 0.003221 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036725 | Grad Max: 0.036725 [GRADIENT NORM TOTAL] 17.3347 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004022 0.49959773] | 
Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 665/1383 | C: 776/1272 [LOSS Ex1] A: 0.62925 | B: 0.60415 | C: 0.58821 [LOGITS Ex2 A] Mean Abs: 2.052 | Max: 5.639 [LOSS Ex2] A: 0.10262 | B: 0.32421 | C: 0.22311 ** [JOINT LOSS] ** : 0.823851 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007745 | Grad Max: 0.282745 -> Layer: shared_layers.0.bias | Grad Mean: 0.773711 | Grad Max: 3.771061 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005248 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005704 | Grad Max: 0.005704 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004791 | Grad Max: 0.762482 -> Layer: exit2_layers.0.bias | Grad Mean: 0.089488 | Grad Max: 4.248768 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000477 | Grad Max: 0.014458 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051304 | Grad Max: 0.265203 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000694 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010357 | Grad Max: 0.019605 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000611 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002752 | Grad Max: 0.009382 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001194 | Grad Max: 0.003360 -> Layer: exit2_layers.12.bias | Grad Mean: 0.036924 | Grad Max: 0.036924 [GRADIENT NORM TOTAL] 16.8947 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54095274 0.45904723] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 674/1374 | C: 697/1351 [LOSS Ex1] A: 0.63014 | B: 0.59966 | C: 0.60165 [LOGITS Ex2 A] Mean Abs: 2.042 | Max: 6.701 [LOSS Ex2] A: 0.10991 | B: 0.30974 | C: 0.22223 ** [JOINT LOSS] ** : 0.824441 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009390 | Grad 
Max: 0.287333 -> Layer: shared_layers.0.bias | Grad Mean: 0.813973 | Grad Max: 3.720760 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002100 | Grad Max: 0.005975 -> Layer: exit1_layers.0.bias | Grad Mean: 0.012337 | Grad Max: 0.012337 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005069 | Grad Max: 0.793755 -> Layer: exit2_layers.0.bias | Grad Mean: 0.094080 | Grad Max: 4.473201 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000511 | Grad Max: 0.015251 -> Layer: exit2_layers.3.bias | Grad Mean: 0.054856 | Grad Max: 0.279427 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000074 | Grad Max: 0.000769 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011168 | Grad Max: 0.021944 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000591 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002989 | Grad Max: 0.009686 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001283 | Grad Max: 0.003601 -> Layer: exit2_layers.12.bias | Grad Mean: 0.039548 | Grad Max: 0.039548 [GRADIENT NORM TOTAL] 17.4058 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318686 0.1668131 ] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 626/1230 | C: 749/1299 [LOSS Ex1] A: 0.62344 | B: 0.60360 | C: 0.59908 [LOGITS Ex2 A] Mean Abs: 2.109 | Max: 7.825 [LOSS Ex2] A: 0.09484 | B: 0.31478 | C: 0.22116 ** [JOINT LOSS] ** : 0.818966 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007258 | Grad Max: 0.271672 -> Layer: shared_layers.0.bias | Grad Mean: 0.764688 | Grad Max: 3.522353 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002122 | Grad Max: 0.005682 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002555 | Grad Max: 0.002556 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004730 | Grad Max: 0.682430 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088011 | Grad Max: 3.790436 
-> Layer: exit2_layers.3.weight | Grad Mean: 0.000476 | Grad Max: 0.014875 -> Layer: exit2_layers.3.bias | Grad Mean: 0.051790 | Grad Max: 0.265771 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000779 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010506 | Grad Max: 0.021811 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000651 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002776 | Grad Max: 0.009372 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001142 | Grad Max: 0.003208 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035833 | Grad Max: 0.035833 [GRADIENT NORM TOTAL] 16.3728 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010797 0.4989203] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 709/1339 [LOSS Ex1] A: 0.63079 | B: 0.60391 | C: 0.59923 [LOGITS Ex2 A] Mean Abs: 2.112 | Max: 6.300 [LOSS Ex2] A: 0.09791 | B: 0.33356 | C: 0.21350 ** [JOINT LOSS] ** : 0.826302 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007558 | Grad Max: 0.293265 -> Layer: shared_layers.0.bias | Grad Mean: 0.778919 | Grad Max: 3.903029 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002056 | Grad Max: 0.005099 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002803 | Grad Max: 0.002803 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004835 | Grad Max: 0.792147 -> Layer: exit2_layers.0.bias | Grad Mean: 0.089235 | Grad Max: 4.426651 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000461 | Grad Max: 0.013731 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049858 | Grad Max: 0.251145 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000718 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010166 | Grad Max: 0.020211 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000615 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002685 | Grad Max: 0.009240 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001134 | Grad Max: 0.003295 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035372 | Grad Max: 0.035372 [GRADIENT NORM TOTAL] 17.0767 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557093 0.24429072] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 665/1383 | C: 754/1294 [LOSS Ex1] A: 0.62568 | B: 0.60415 | C: 0.59544 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 6.867 [LOSS Ex2] A: 0.11905 | B: 0.31919 | C: 0.20009 ** [JOINT LOSS] ** : 0.821202 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009219 | Grad Max: 0.258299 -> Layer: shared_layers.0.bias | Grad Mean: 0.761532 | Grad Max: 3.587745 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002126 | Grad Max: 0.005662 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001431 | Grad Max: 0.001431 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004714 | Grad Max: 0.731339 -> Layer: exit2_layers.0.bias | Grad Mean: 0.087542 | Grad Max: 4.125095 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000479 | Grad Max: 0.013803 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050821 | Grad Max: 0.247780 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000069 | Grad Max: 0.000783 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010308 | Grad Max: 0.020624 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000566 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002718 | Grad Max: 0.009328 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001161 | Grad Max: 0.003528 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035307 | Grad Max: 0.035307 [GRADIENT NORM TOTAL] 16.1018 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean 
Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482901 0.35170993] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 674/1374 | C: 722/1326 [LOSS Ex1] A: 0.62377 | B: 0.59966 | C: 0.60018 [LOGITS Ex2 A] Mean Abs: 2.124 | Max: 7.735 [LOSS Ex2] A: 0.10622 | B: 0.30795 | C: 0.20637 ** [JOINT LOSS] ** : 0.814718 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008261 | Grad Max: 0.245013 -> Layer: shared_layers.0.bias | Grad Mean: 0.729964 | Grad Max: 3.275234 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002099 | Grad Max: 0.005598 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006041 | Grad Max: 0.006041 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004576 | Grad Max: 0.759522 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084883 | Grad Max: 4.291815 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.014621 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048992 | Grad Max: 0.250512 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000750 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009937 | Grad Max: 0.019728 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000621 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002601 | Grad Max: 0.009423 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001146 | Grad Max: 0.003664 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034387 | Grad Max: 0.034387 [GRADIENT NORM TOTAL] 15.7682 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085093 0.49149072] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 695/1353 [LOSS Ex1] A: 0.62457 | B: 0.60360 | C: 0.60210 [LOGITS Ex2 A] Mean Abs: 2.132 | Max: 7.685 [LOSS Ex2] A: 0.09767 | B: 0.31978 | C: 0.20229 ** [JOINT LOSS] ** : 0.816673 
[GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009500 | Grad Max: 0.273171 -> Layer: shared_layers.0.bias | Grad Mean: 0.821105 | Grad Max: 3.524477 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005921 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000284 | Grad Max: 0.000284 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005049 | Grad Max: 0.723963 -> Layer: exit2_layers.0.bias | Grad Mean: 0.093820 | Grad Max: 4.061149 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000527 | Grad Max: 0.016224 -> Layer: exit2_layers.3.bias | Grad Mean: 0.056287 | Grad Max: 0.282230 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000076 | Grad Max: 0.000885 -> Layer: exit2_layers.6.bias | Grad Mean: 0.011419 | Grad Max: 0.022761 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000033 | Grad Max: 0.000662 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002992 | Grad Max: 0.010491 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001228 | Grad Max: 0.003546 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038095 | Grad Max: 0.038095 [GRADIENT NORM TOTAL] 17.1257 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037908 0.4962092] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 676/1372 | C: 712/1336 [LOSS Ex1] A: 0.62125 | B: 0.60391 | C: 0.60011 [LOGITS Ex2 A] Mean Abs: 2.121 | Max: 6.767 [LOSS Ex2] A: 0.10073 | B: 0.33249 | C: 0.20704 ** [JOINT LOSS] ** : 0.821842 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008712 | Grad Max: 0.297094 -> Layer: shared_layers.0.bias | Grad Mean: 0.828636 | Grad Max: 4.018713 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006347 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003052 | Grad Max: 0.003052 -> Layer: exit2_layers.0.weight | Grad Mean: 0.005138 | Grad Max: 
0.804664 -> Layer: exit2_layers.0.bias | Grad Mean: 0.095404 | Grad Max: 4.533433 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000500 | Grad Max: 0.014998 -> Layer: exit2_layers.3.bias | Grad Mean: 0.053902 | Grad Max: 0.275727 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000072 | Grad Max: 0.000711 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010943 | Grad Max: 0.021223 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000032 | Grad Max: 0.000589 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002910 | Grad Max: 0.009261 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001253 | Grad Max: 0.003452 -> Layer: exit2_layers.12.bias | Grad Mean: 0.038724 | Grad Max: 0.038724 [GRADIENT NORM TOTAL] 17.8904 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122501 0.48774984] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 710/1338 [LOSS Ex1] A: 0.62021 | B: 0.60415 | C: 0.60108 [LOGITS Ex2 A] Mean Abs: 2.081 | Max: 6.137 [LOSS Ex2] A: 0.10803 | B: 0.31987 | C: 0.20459 ** [JOINT LOSS] ** : 0.819309 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007624 | Grad Max: 0.254914 -> Layer: shared_layers.0.bias | Grad Mean: 0.724261 | Grad Max: 3.410763 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002164 | Grad Max: 0.005813 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002484 | Grad Max: 0.002484 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004415 | Grad Max: 0.752354 -> Layer: exit2_layers.0.bias | Grad Mean: 0.082261 | Grad Max: 4.208348 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.013687 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047284 | Grad Max: 0.239631 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000657 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009521 | Grad Max: 0.018448 -> 
Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000629 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002503 | Grad Max: 0.008817 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.003376 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033443 | Grad Max: 0.033443 [GRADIENT NORM TOTAL] 15.6458 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040233 0.49959767] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 721/1327 [LOSS Ex1] A: 0.62925 | B: 0.59966 | C: 0.60220 [LOGITS Ex2 A] Mean Abs: 2.070 | Max: 5.431 [LOSS Ex2] A: 0.10096 | B: 0.30666 | C: 0.20230 ** [JOINT LOSS] ** : 0.813681 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007079 | Grad Max: 0.255372 -> Layer: shared_layers.0.bias | Grad Mean: 0.722825 | Grad Max: 3.331891 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002083 | Grad Max: 0.004967 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001774 | Grad Max: 0.001774 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004530 | Grad Max: 0.795149 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084543 | Grad Max: 4.447441 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000459 | Grad Max: 0.013101 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049379 | Grad Max: 0.244325 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000737 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009887 | Grad Max: 0.020467 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000615 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002574 | Grad Max: 0.009084 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001072 | Grad Max: 0.003382 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032941 | Grad Max: 0.032941 [GRADIENT NORM TOTAL] 15.7244 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409516 0.45904836] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 732/1316 [LOSS Ex1] A: 0.63014 | B: 0.60360 | C: 0.60212 [LOGITS Ex2 A] Mean Abs: 2.058 | Max: 5.998 [LOSS Ex2] A: 0.10184 | B: 0.31149 | C: 0.20222 ** [JOINT LOSS] ** : 0.817138 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007920 | Grad Max: 0.220602 -> Layer: shared_layers.0.bias | Grad Mean: 0.660516 | Grad Max: 3.020899 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002033 | Grad Max: 0.005030 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003595 | Grad Max: 0.003595 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004167 | Grad Max: 0.640904 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077184 | Grad Max: 3.613054 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000422 | Grad Max: 0.014169 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044975 | Grad Max: 0.237181 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000061 | Grad Max: 0.000642 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009136 | Grad Max: 0.018177 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000515 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002396 | Grad Max: 0.008140 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001042 | Grad Max: 0.003304 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031544 | Grad Max: 0.031544 [GRADIENT NORM TOTAL] 14.2374 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331866 0.16681337] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 731/1317 [LOSS Ex1] A: 0.62344 | B: 0.60391 | C: 0.59132 [LOGITS Ex2 A] Mean Abs: 
2.103 | Max: 6.394 [LOSS Ex2] A: 0.09649 | B: 0.32173 | C: 0.20583 ** [JOINT LOSS] ** : 0.814239 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005888 | Grad Max: 0.281947 -> Layer: shared_layers.0.bias | Grad Mean: 0.733430 | Grad Max: 3.798266 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005572 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001524 | Grad Max: 0.001524 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004483 | Grad Max: 0.736988 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083594 | Grad Max: 4.103831 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.014272 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047682 | Grad Max: 0.247768 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000695 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009581 | Grad Max: 0.019274 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000563 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002537 | Grad Max: 0.008616 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001062 | Grad Max: 0.003367 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033390 | Grad Max: 0.033390 [GRADIENT NORM TOTAL] 16.0634 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50107986 0.49892014] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 509/867 [LOSS Ex1] A: 0.63079 | B: 0.60415 | C: 0.59742 [LOGITS Ex2 A] Mean Abs: 2.128 | Max: 5.641 [LOSS Ex2] A: 0.09308 | B: 0.31781 | C: 0.20826 ** [JOINT LOSS] ** : 0.817171 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006804 | Grad Max: 0.266654 -> Layer: shared_layers.0.bias | Grad Mean: 0.716886 | Grad Max: 3.498542 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002017 | Grad Max: 0.005027 -> Layer: exit1_layers.0.bias | Grad Mean: 
0.004928 | Grad Max: 0.004928 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004391 | Grad Max: 0.758154 -> Layer: exit2_layers.0.bias | Grad Mean: 0.081585 | Grad Max: 4.219219 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000437 | Grad Max: 0.013272 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047455 | Grad Max: 0.245340 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000667 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009586 | Grad Max: 0.018624 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000577 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002543 | Grad Max: 0.008677 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001075 | Grad Max: 0.003351 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033698 | Grad Max: 0.033698 [GRADIENT NORM TOTAL] 15.4922 [EPOCH SUMMARY] Train Loss: 0.8201 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8072 | Alpha: 0.5500 No improve count: 12/15 ############################## EPOCH 186/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75570863 0.24429138] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 752/1296 [LOSS Ex1] A: 0.62568 | B: 0.59966 | C: 0.59366 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 6.209 [LOSS Ex2] A: 0.11229 | B: 0.30664 | C: 0.20085 ** [JOINT LOSS] ** : 0.812928 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.009215 | Grad Max: 0.249093 -> Layer: shared_layers.0.bias | Grad Mean: 0.736755 | Grad Max: 3.421594 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002193 | Grad Max: 0.005903 -> Layer: exit1_layers.0.bias | Grad Mean: 0.009135 | Grad Max: 0.009135 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004546 | Grad Max: 0.743008 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.083871 | Grad Max: 4.173026 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000456 | Grad Max: 0.014493 -> Layer: exit2_layers.3.bias | Grad Mean: 0.048610 | Grad Max: 0.237567 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000693 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009888 | Grad Max: 0.019974 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000583 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002570 | Grad Max: 0.009345 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001092 | Grad Max: 0.003542 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033085 | Grad Max: 0.033085 [GRADIENT NORM TOTAL] 15.5858 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64828885 0.35171115] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 626/1230 | C: 721/1327 [LOSS Ex1] A: 0.62377 | B: 0.60360 | C: 0.60265 [LOGITS Ex2 A] Mean Abs: 2.147 | Max: 8.310 [LOSS Ex2] A: 0.10239 | B: 0.30688 | C: 0.22714 ** [JOINT LOSS] ** : 0.822143 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008234 | Grad Max: 0.248137 -> Layer: shared_layers.0.bias | Grad Mean: 0.731031 | Grad Max: 3.140438 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002063 | Grad Max: 0.006046 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000535 | Grad Max: 0.000535 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004490 | Grad Max: 0.679851 -> Layer: exit2_layers.0.bias | Grad Mean: 0.083414 | Grad Max: 3.803739 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000470 | Grad Max: 0.014205 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050577 | Grad Max: 0.248081 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000770 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010260 | Grad Max: 0.019918 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000609 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002716 | Grad Max: 0.009604 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001159 | Grad Max: 0.003477 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035926 | Grad Max: 0.035926 [GRADIENT NORM TOTAL] 15.3138 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850916 0.49149087] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 715/1333 [LOSS Ex1] A: 0.62457 | B: 0.60391 | C: 0.60173 [LOGITS Ex2 A] Mean Abs: 2.125 | Max: 7.692 [LOSS Ex2] A: 0.09388 | B: 0.32933 | C: 0.21434 ** [JOINT LOSS] ** : 0.822588 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008552 | Grad Max: 0.287059 -> Layer: shared_layers.0.bias | Grad Mean: 0.777928 | Grad Max: 3.786120 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002064 | Grad Max: 0.005683 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000021 | Grad Max: 0.000021 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004813 | Grad Max: 0.833840 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088839 | Grad Max: 4.678837 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000473 | Grad Max: 0.013296 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050716 | Grad Max: 0.243041 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000068 | Grad Max: 0.000706 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010369 | Grad Max: 0.019796 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000578 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002796 | Grad Max: 0.008769 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001206 | Grad Max: 0.003428 -> Layer: exit2_layers.12.bias | Grad Mean: 0.037824 | Grad Max: 0.037824 [GRADIENT NORM TOTAL] 16.9204 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50379 0.49620998] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 712/1336 [LOSS Ex1] A: 0.62125 | B: 0.60415 | C: 0.59836 [LOGITS Ex2 A] Mean Abs: 2.113 | Max: 5.921 [LOSS Ex2] A: 0.10175 | B: 0.32991 | C: 0.21115 ** [JOINT LOSS] ** : 0.822193 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007644 | Grad Max: 0.236983 -> Layer: shared_layers.0.bias | Grad Mean: 0.701715 | Grad Max: 3.239472 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002148 | Grad Max: 0.006021 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002351 | Grad Max: 0.002351 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004315 | Grad Max: 0.672996 -> Layer: exit2_layers.0.bias | Grad Mean: 0.080252 | Grad Max: 3.783895 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000430 | Grad Max: 0.012625 -> Layer: exit2_layers.3.bias | Grad Mean: 0.046359 | Grad Max: 0.224851 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000062 | Grad Max: 0.000689 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009326 | Grad Max: 0.018756 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000027 | Grad Max: 0.000549 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002442 | Grad Max: 0.008557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001057 | Grad Max: 0.003444 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032255 | Grad Max: 0.032255 [GRADIENT NORM TOTAL] 14.9737 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122504 0.48774958] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 744/1304 [LOSS Ex1] A: 0.62021 | B: 0.59966 | C: 0.59622 [LOGITS Ex2 A] Mean Abs: 2.096 | Max: 7.004 
[LOSS Ex2] A: 0.11645 | B: 0.29883 | C: 0.20720 ** [JOINT LOSS] ** : 0.812855 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007275 | Grad Max: 0.243145 -> Layer: shared_layers.0.bias | Grad Mean: 0.680541 | Grad Max: 3.195089 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002217 | Grad Max: 0.005715 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002225 | Grad Max: 0.002225 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004151 | Grad Max: 0.690252 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077438 | Grad Max: 3.873073 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.012629 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044472 | Grad Max: 0.227292 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000655 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008908 | Grad Max: 0.018502 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000550 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002356 | Grad Max: 0.008177 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001037 | Grad Max: 0.003300 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031408 | Grad Max: 0.031408 [GRADIENT NORM TOTAL] 14.4741 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004024 0.4995976] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 745/1303 [LOSS Ex1] A: 0.62925 | B: 0.60360 | C: 0.59481 [LOGITS Ex2 A] Mean Abs: 2.080 | Max: 5.138 [LOSS Ex2] A: 0.10065 | B: 0.30494 | C: 0.22550 ** [JOINT LOSS] ** : 0.819583 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007313 | Grad Max: 0.254235 -> Layer: shared_layers.0.bias | Grad Mean: 0.704891 | Grad Max: 3.300234 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005218 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004288 | Grad Max: 
0.004288 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004445 | Grad Max: 0.721949 -> Layer: exit2_layers.0.bias | Grad Mean: 0.082187 | Grad Max: 4.053629 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000441 | Grad Max: 0.013652 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047449 | Grad Max: 0.235260 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000722 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009621 | Grad Max: 0.018996 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000566 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002511 | Grad Max: 0.008592 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001064 | Grad Max: 0.003227 -> Layer: exit2_layers.12.bias | Grad Mean: 0.033060 | Grad Max: 0.033060 [GRADIENT NORM TOTAL] 15.3040 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409505 0.4590495] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 676/1372 | C: 762/1286 [LOSS Ex1] A: 0.63014 | B: 0.60391 | C: 0.59910 [LOGITS Ex2 A] Mean Abs: 2.051 | Max: 5.663 [LOSS Ex2] A: 0.10653 | B: 0.32336 | C: 0.20726 ** [JOINT LOSS] ** : 0.823429 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.008152 | Grad Max: 0.271708 -> Layer: shared_layers.0.bias | Grad Mean: 0.754435 | Grad Max: 3.694061 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002015 | Grad Max: 0.005473 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006073 | Grad Max: 0.006073 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004744 | Grad Max: 0.784414 -> Layer: exit2_layers.0.bias | Grad Mean: 0.088367 | Grad Max: 4.416069 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000468 | Grad Max: 0.013953 -> Layer: exit2_layers.3.bias | Grad Mean: 0.050398 | Grad Max: 0.257832 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000067 | Grad Max: 0.000710 -> 
Layer: exit2_layers.6.bias | Grad Mean: 0.010179 | Grad Max: 0.020912 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000030 | Grad Max: 0.000567 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002677 | Grad Max: 0.009195 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001135 | Grad Max: 0.003615 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034872 | Grad Max: 0.034872 [GRADIENT NORM TOTAL] 16.4829 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318645 0.16681357] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 665/1383 | C: 736/1312 [LOSS Ex1] A: 0.62344 | B: 0.60415 | C: 0.59299 [LOGITS Ex2 A] Mean Abs: 2.116 | Max: 7.133 [LOSS Ex2] A: 0.09796 | B: 0.31623 | C: 0.19974 ** [JOINT LOSS] ** : 0.811505 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005782 | Grad Max: 0.249808 -> Layer: shared_layers.0.bias | Grad Mean: 0.684816 | Grad Max: 3.311555 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002176 | Grad Max: 0.005693 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004251 | Grad Max: 0.004251 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004136 | Grad Max: 0.671266 -> Layer: exit2_layers.0.bias | Grad Mean: 0.077172 | Grad Max: 3.748067 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000414 | Grad Max: 0.014116 -> Layer: exit2_layers.3.bias | Grad Mean: 0.045202 | Grad Max: 0.237946 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000720 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009140 | Grad Max: 0.020477 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000516 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002396 | Grad Max: 0.008465 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001001 | Grad Max: 0.003340 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031414 | Grad Max: 0.031414 [GRADIENT NORM 
TOTAL] 14.6074 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50108004 0.49891996] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 674/1374 | C: 713/1335 [LOSS Ex1] A: 0.63079 | B: 0.59966 | C: 0.60383 [LOGITS Ex2 A] Mean Abs: 2.120 | Max: 6.693 [LOSS Ex2] A: 0.09231 | B: 0.29991 | C: 0.21015 ** [JOINT LOSS] ** : 0.812217 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006832 | Grad Max: 0.204160 -> Layer: shared_layers.0.bias | Grad Mean: 0.580316 | Grad Max: 2.756455 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005001 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004024 | Grad Max: 0.004024 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003657 | Grad Max: 0.593210 -> Layer: exit2_layers.0.bias | Grad Mean: 0.066935 | Grad Max: 3.295705 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000356 | Grad Max: 0.010587 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037935 | Grad Max: 0.180622 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000615 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007784 | Grad Max: 0.016342 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000474 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002025 | Grad Max: 0.007103 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000875 | Grad Max: 0.003072 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026428 | Grad Max: 0.026428 [GRADIENT NORM TOTAL] 12.5192 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557081 0.24429193] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 626/1230 | C: 709/1339 [LOSS Ex1] 
A: 0.62568 | B: 0.60360 | C: 0.60208 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 7.077 [LOSS Ex2] A: 0.11557 | B: 0.30712 | C: 0.19348 ** [JOINT LOSS] ** : 0.815846 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007316 | Grad Max: 0.222380 -> Layer: shared_layers.0.bias | Grad Mean: 0.651993 | Grad Max: 2.974643 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002047 | Grad Max: 0.005507 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003965 | Grad Max: 0.003965 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004042 | Grad Max: 0.604047 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074818 | Grad Max: 3.407891 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000407 | Grad Max: 0.012224 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043725 | Grad Max: 0.218726 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000664 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008825 | Grad Max: 0.017529 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000520 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002295 | Grad Max: 0.007880 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000958 | Grad Max: 0.003335 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029165 | Grad Max: 0.029165 [GRADIENT NORM TOTAL] 13.8109 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64828753 0.35171247] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 676/1372 | C: 706/1342 [LOSS Ex1] A: 0.62377 | B: 0.60391 | C: 0.60284 [LOGITS Ex2 A] Mean Abs: 2.145 | Max: 8.353 [LOSS Ex2] A: 0.09809 | B: 0.32066 | C: 0.20931 ** [JOINT LOSS] ** : 0.819525 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007240 | Grad Max: 0.266877 -> Layer: shared_layers.0.bias | Grad Mean: 0.743254 | Grad Max: 3.640386 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002106 | 
Grad Max: 0.006329 -> Layer: exit1_layers.0.bias | Grad Mean: 0.014988 | Grad Max: 0.014988 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004628 | Grad Max: 0.775777 -> Layer: exit2_layers.0.bias | Grad Mean: 0.085550 | Grad Max: 4.353660 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000452 | Grad Max: 0.013784 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049233 | Grad Max: 0.253784 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000065 | Grad Max: 0.000726 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010011 | Grad Max: 0.020852 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000621 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002666 | Grad Max: 0.009049 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001125 | Grad Max: 0.003446 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035004 | Grad Max: 0.035004 [GRADIENT NORM TOTAL] 16.4500 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.508509 0.491491] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 665/1383 | C: 724/1324 [LOSS Ex1] A: 0.62457 | B: 0.60415 | C: 0.59969 [LOGITS Ex2 A] Mean Abs: 2.139 | Max: 7.236 [LOSS Ex2] A: 0.09652 | B: 0.32446 | C: 0.20367 ** [JOINT LOSS] ** : 0.817688 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007322 | Grad Max: 0.237742 -> Layer: shared_layers.0.bias | Grad Mean: 0.663928 | Grad Max: 3.201676 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002112 | Grad Max: 0.005558 -> Layer: exit1_layers.0.bias | Grad Mean: 0.003695 | Grad Max: 0.003695 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004076 | Grad Max: 0.715225 -> Layer: exit2_layers.0.bias | Grad Mean: 0.075610 | Grad Max: 4.019011 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.012044 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043803 | Grad Max: 
0.220571 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000644 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008913 | Grad Max: 0.017674 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000527 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002328 | Grad Max: 0.007940 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000985 | Grad Max: 0.002994 -> Layer: exit2_layers.12.bias | Grad Mean: 0.030500 | Grad Max: 0.030500 [GRADIENT NORM TOTAL] 14.2362 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50378925 0.49621078] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 674/1374 | C: 709/1339 [LOSS Ex1] A: 0.62125 | B: 0.59966 | C: 0.60115 [LOGITS Ex2 A] Mean Abs: 2.120 | Max: 6.887 [LOSS Ex2] A: 0.10576 | B: 0.29593 | C: 0.20663 ** [JOINT LOSS] ** : 0.810133 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005275 | Grad Max: 0.213051 -> Layer: shared_layers.0.bias | Grad Mean: 0.600426 | Grad Max: 2.848289 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002173 | Grad Max: 0.005792 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001222 | Grad Max: 0.001222 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003690 | Grad Max: 0.653376 -> Layer: exit2_layers.0.bias | Grad Mean: 0.068768 | Grad Max: 3.662163 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000365 | Grad Max: 0.011721 -> Layer: exit2_layers.3.bias | Grad Mean: 0.039951 | Grad Max: 0.217400 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000575 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007999 | Grad Max: 0.016165 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000440 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002082 | Grad Max: 0.006442 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000878 | Grad Max: 0.002832 
-> Layer: exit2_layers.12.bias | Grad Mean: 0.027127 | Grad Max: 0.027127 [GRADIENT NORM TOTAL] 13.0583 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122507 0.48774925] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 626/1230 | C: 488/888 [LOSS Ex1] A: 0.62021 | B: 0.60360 | C: 0.59620 [LOGITS Ex2 A] Mean Abs: 2.083 | Max: 6.342 [LOSS Ex2] A: 0.10303 | B: 0.30197 | C: 0.23641 ** [JOINT LOSS] ** : 0.820472 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005335 | Grad Max: 0.246352 -> Layer: shared_layers.0.bias | Grad Mean: 0.660678 | Grad Max: 3.122468 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002159 | Grad Max: 0.005989 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002978 | Grad Max: 0.002978 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003984 | Grad Max: 0.639311 -> Layer: exit2_layers.0.bias | Grad Mean: 0.074330 | Grad Max: 3.573451 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000406 | Grad Max: 0.013269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044282 | Grad Max: 0.211338 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000649 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008800 | Grad Max: 0.017618 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000513 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002339 | Grad Max: 0.007898 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000994 | Grad Max: 0.003097 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031421 | Grad Max: 0.031421 [GRADIENT NORM TOTAL] 14.1321 [EPOCH SUMMARY] Train Loss: 0.8174 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... 
[EPOCH END] Val Loss: 0.8045 | Alpha: 0.5500 No improve count: 13/15 ############################## EPOCH 187/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040245 0.49959752] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 676/1372 | C: 754/1294 [LOSS Ex1] A: 0.62925 | B: 0.60391 | C: 0.59522 [LOGITS Ex2 A] Mean Abs: 2.086 | Max: 5.697 [LOSS Ex2] A: 0.09964 | B: 0.32939 | C: 0.22413 ** [JOINT LOSS] ** : 0.827183 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007300 | Grad Max: 0.264881 -> Layer: shared_layers.0.bias | Grad Mean: 0.723565 | Grad Max: 3.551191 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002071 | Grad Max: 0.005518 -> Layer: exit1_layers.0.bias | Grad Mean: 0.006683 | Grad Max: 0.006683 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004551 | Grad Max: 0.751902 -> Layer: exit2_layers.0.bias | Grad Mean: 0.084384 | Grad Max: 4.219884 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000457 | Grad Max: 0.014493 -> Layer: exit2_layers.3.bias | Grad Mean: 0.049474 | Grad Max: 0.251089 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000066 | Grad Max: 0.000739 -> Layer: exit2_layers.6.bias | Grad Mean: 0.010074 | Grad Max: 0.020373 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000029 | Grad Max: 0.000569 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002656 | Grad Max: 0.008772 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001142 | Grad Max: 0.003258 -> Layer: exit2_layers.12.bias | Grad Mean: 0.035329 | Grad Max: 0.035329 [GRADIENT NORM TOTAL] 15.7346 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409494 0.45905066] | 
Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 665/1383 | C: 739/1309 [LOSS Ex1] A: 0.63013 | B: 0.60415 | C: 0.60167 [LOGITS Ex2 A] Mean Abs: 2.068 | Max: 5.705 [LOSS Ex2] A: 0.11074 | B: 0.32124 | C: 0.21821 ** [JOINT LOSS] ** : 0.828712 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007747 | Grad Max: 0.241281 -> Layer: shared_layers.0.bias | Grad Mean: 0.686280 | Grad Max: 3.121789 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001985 | Grad Max: 0.005350 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010715 | Grad Max: 0.010715 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004277 | Grad Max: 0.734749 -> Layer: exit2_layers.0.bias | Grad Mean: 0.079714 | Grad Max: 4.119785 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000439 | Grad Max: 0.012909 -> Layer: exit2_layers.3.bias | Grad Mean: 0.047084 | Grad Max: 0.234606 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000063 | Grad Max: 0.000697 -> Layer: exit2_layers.6.bias | Grad Mean: 0.009546 | Grad Max: 0.018449 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000028 | Grad Max: 0.000562 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002546 | Grad Max: 0.008901 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001095 | Grad Max: 0.003368 -> Layer: exit2_layers.12.bias | Grad Mean: 0.034013 | Grad Max: 0.034013 [GRADIENT NORM TOTAL] 14.7542 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331862 0.16681375] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 674/1374 | C: 703/1345 [LOSS Ex1] A: 0.62344 | B: 0.59966 | C: 0.59788 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 7.152 [LOSS Ex2] A: 0.09897 | B: 0.29826 | C: 0.20539 ** [JOINT LOSS] ** : 0.807868 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005243 | Grad Max: 
0.190491 -> Layer: shared_layers.0.bias | Grad Mean: 0.523630 | Grad Max: 2.552562 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002205 | Grad Max: 0.005483 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005098 | Grad Max: 0.005098 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003278 | Grad Max: 0.401644 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060591 | Grad Max: 2.244462 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000331 | Grad Max: 0.010133 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035415 | Grad Max: 0.176780 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000581 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007214 | Grad Max: 0.015019 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000496 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001877 | Grad Max: 0.006921 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000807 | Grad Max: 0.002931 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024755 | Grad Max: 0.024755 [GRADIENT NORM TOTAL] 11.1647 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010802 0.4989198] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 626/1230 | C: 730/1318 [LOSS Ex1] A: 0.63079 | B: 0.60360 | C: 0.59409 [LOGITS Ex2 A] Mean Abs: 2.123 | Max: 6.507 [LOSS Ex2] A: 0.08839 | B: 0.30794 | C: 0.21152 ** [JOINT LOSS] ** : 0.812109 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007000 | Grad Max: 0.205359 -> Layer: shared_layers.0.bias | Grad Mean: 0.602296 | Grad Max: 2.669979 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002072 | Grad Max: 0.005491 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000567 | Grad Max: 0.000567 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003788 | Grad Max: 0.423704 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070069 | Grad Max: 2.379806 -> 
Layer: exit2_layers.3.weight | Grad Mean: 0.000399 | Grad Max: 0.012963 -> Layer: exit2_layers.3.bias | Grad Mean: 0.042723 | Grad Max: 0.222214 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000667 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008697 | Grad Max: 0.017987 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000600 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002273 | Grad Max: 0.008517 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000940 | Grad Max: 0.003195 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029250 | Grad Max: 0.029250 [GRADIENT NORM TOTAL] 12.4675 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.75570756 0.24429241] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 676/1372 | C: 739/1309 [LOSS Ex1] A: 0.62568 | B: 0.60391 | C: 0.60101 [LOGITS Ex2 A] Mean Abs: 2.095 | Max: 7.248 [LOSS Ex2] A: 0.10985 | B: 0.32379 | C: 0.20577 ** [JOINT LOSS] ** : 0.823335 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006824 | Grad Max: 0.220128 -> Layer: shared_layers.0.bias | Grad Mean: 0.609092 | Grad Max: 2.872821 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002098 | Grad Max: 0.005753 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000791 | Grad Max: 0.000791 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003818 | Grad Max: 0.497031 -> Layer: exit2_layers.0.bias | Grad Mean: 0.070667 | Grad Max: 2.814669 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000379 | Grad Max: 0.012022 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040772 | Grad Max: 0.222001 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000604 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008220 | Grad Max: 0.016178 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000489 -> Layer: 
exit2_layers.9.bias | Grad Mean: 0.002176 | Grad Max: 0.007657 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000926 | Grad Max: 0.003001 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028641 | Grad Max: 0.028641 [GRADIENT NORM TOTAL] 13.0604 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.64828634 0.35171366] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 665/1383 | C: 691/1357 [LOSS Ex1] A: 0.62377 | B: 0.60415 | C: 0.60331 [LOGITS Ex2 A] Mean Abs: 2.138 | Max: 10.286 [LOSS Ex2] A: 0.10287 | B: 0.32022 | C: 0.18909 ** [JOINT LOSS] ** : 0.814472 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006498 | Grad Max: 0.185110 -> Layer: shared_layers.0.bias | Grad Mean: 0.569787 | Grad Max: 2.529488 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002062 | Grad Max: 0.005526 -> Layer: exit1_layers.0.bias | Grad Mean: 0.001413 | Grad Max: 0.001413 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003526 | Grad Max: 0.504322 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065560 | Grad Max: 2.794327 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000359 | Grad Max: 0.011176 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038309 | Grad Max: 0.206929 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000052 | Grad Max: 0.000651 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007665 | Grad Max: 0.016801 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000023 | Grad Max: 0.000474 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001974 | Grad Max: 0.007368 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000855 | Grad Max: 0.003187 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025451 | Grad Max: 0.025451 [GRADIENT NORM TOTAL] 12.0643 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] 
Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850886 0.49149117] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 674/1374 | C: 753/1295 [LOSS Ex1] A: 0.62457 | B: 0.59966 | C: 0.59584 [LOGITS Ex2 A] Mean Abs: 2.118 | Max: 8.322 [LOSS Ex2] A: 0.09255 | B: 0.29619 | C: 0.20587 ** [JOINT LOSS] ** : 0.804894 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007387 | Grad Max: 0.193063 -> Layer: shared_layers.0.bias | Grad Mean: 0.597565 | Grad Max: 2.635975 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002203 | Grad Max: 0.005884 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004431 | Grad Max: 0.004431 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003735 | Grad Max: 0.501850 -> Layer: exit2_layers.0.bias | Grad Mean: 0.069052 | Grad Max: 2.781194 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000386 | Grad Max: 0.012679 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041077 | Grad Max: 0.206180 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000056 | Grad Max: 0.000667 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008379 | Grad Max: 0.017251 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000502 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002166 | Grad Max: 0.007908 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000952 | Grad Max: 0.003337 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028343 | Grad Max: 0.028343 [GRADIENT NORM TOTAL] 12.4876 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037884 0.49621156] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 626/1230 | C: 711/1337 [LOSS Ex1] A: 0.62125 | B: 0.60360 | C: 0.60307 [LOGITS Ex2 A] Mean Abs: 2.123 | Max: 7.352 [LOSS Ex2] A: 0.10192 | B: 0.30456 | C: 0.21149 ** [JOINT LOSS] ** : 
0.815301 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005266 | Grad Max: 0.199132 -> Layer: shared_layers.0.bias | Grad Mean: 0.580873 | Grad Max: 2.610063 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002155 | Grad Max: 0.006165 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004793 | Grad Max: 0.004793 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003638 | Grad Max: 0.483207 -> Layer: exit2_layers.0.bias | Grad Mean: 0.067513 | Grad Max: 2.680828 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000375 | Grad Max: 0.012403 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040537 | Grad Max: 0.208722 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000599 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008162 | Grad Max: 0.016822 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000506 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002141 | Grad Max: 0.007557 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000927 | Grad Max: 0.003054 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028249 | Grad Max: 0.028249 [GRADIENT NORM TOTAL] 12.2954 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5122511 0.48774892] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 676/1372 | C: 707/1341 [LOSS Ex1] A: 0.62021 | B: 0.60391 | C: 0.60799 [LOGITS Ex2 A] Mean Abs: 2.093 | Max: 7.241 [LOSS Ex2] A: 0.10672 | B: 0.31592 | C: 0.22788 ** [JOINT LOSS] ** : 0.827541 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007031 | Grad Max: 0.238829 -> Layer: shared_layers.0.bias | Grad Mean: 0.655340 | Grad Max: 2.972092 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002057 | Grad Max: 0.005457 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005935 | Grad Max: 0.005935 -> Layer: exit2_layers.0.weight | Grad Mean: 0.004018 | 
Grad Max: 0.550272 -> Layer: exit2_layers.0.bias | Grad Mean: 0.075057 | Grad Max: 3.025749 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000415 | Grad Max: 0.014744 -> Layer: exit2_layers.3.bias | Grad Mean: 0.044319 | Grad Max: 0.253060 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000059 | Grad Max: 0.000687 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008868 | Grad Max: 0.018604 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000549 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002377 | Grad Max: 0.007745 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001046 | Grad Max: 0.003214 -> Layer: exit2_layers.12.bias | Grad Mean: 0.032096 | Grad Max: 0.032096 [GRADIENT NORM TOTAL] 13.6991 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50040257 0.49959743] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 665/1383 | C: 708/1340 [LOSS Ex1] A: 0.62925 | B: 0.60415 | C: 0.60111 [LOGITS Ex2 A] Mean Abs: 2.088 | Max: 6.026 [LOSS Ex2] A: 0.10074 | B: 0.30989 | C: 0.21404 ** [JOINT LOSS] ** : 0.819728 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006873 | Grad Max: 0.221591 -> Layer: shared_layers.0.bias | Grad Mean: 0.615363 | Grad Max: 2.788455 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002029 | Grad Max: 0.006019 -> Layer: exit1_layers.0.bias | Grad Mean: 0.010675 | Grad Max: 0.010675 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003868 | Grad Max: 0.507046 -> Layer: exit2_layers.0.bias | Grad Mean: 0.071831 | Grad Max: 2.787810 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000404 | Grad Max: 0.011243 -> Layer: exit2_layers.3.bias | Grad Mean: 0.043227 | Grad Max: 0.200154 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000058 | Grad Max: 0.000643 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008802 | Grad Max: 
0.017510 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000026 | Grad Max: 0.000494 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002341 | Grad Max: 0.007526 -> Layer: exit2_layers.12.weight | Grad Mean: 0.001003 | Grad Max: 0.003234 -> Layer: exit2_layers.12.bias | Grad Mean: 0.031095 | Grad Max: 0.031095 [GRADIENT NORM TOTAL] 12.9683 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5409482 0.45905182] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 674/1374 | C: 735/1313 [LOSS Ex1] A: 0.63013 | B: 0.59966 | C: 0.60384 [LOGITS Ex2 A] Mean Abs: 2.071 | Max: 5.945 [LOSS Ex2] A: 0.10455 | B: 0.29666 | C: 0.20657 ** [JOINT LOSS] ** : 0.813806 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006130 | Grad Max: 0.166371 -> Layer: shared_layers.0.bias | Grad Mean: 0.537527 | Grad Max: 2.254179 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002014 | Grad Max: 0.005388 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004914 | Grad Max: 0.004914 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003316 | Grad Max: 0.417892 -> Layer: exit2_layers.0.bias | Grad Mean: 0.061524 | Grad Max: 2.326216 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000344 | Grad Max: 0.010598 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037003 | Grad Max: 0.190425 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000050 | Grad Max: 0.000589 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007431 | Grad Max: 0.015419 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000413 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001921 | Grad Max: 0.006258 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000823 | Grad Max: 0.002884 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024751 | Grad Max: 0.024751 [GRADIENT NORM TOTAL] 11.1796 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: 
torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.83318603 0.16681394] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 626/1230 | C: 718/1330 [LOSS Ex1] A: 0.62344 | B: 0.60360 | C: 0.59464 [LOGITS Ex2 A] Mean Abs: 2.114 | Max: 7.713 [LOSS Ex2] A: 0.09695 | B: 0.29642 | C: 0.21170 ** [JOINT LOSS] ** : 0.808918 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004194 | Grad Max: 0.184553 -> Layer: shared_layers.0.bias | Grad Mean: 0.528907 | Grad Max: 2.452946 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002212 | Grad Max: 0.005982 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007160 | Grad Max: 0.007160 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003239 | Grad Max: 0.432457 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060326 | Grad Max: 2.373373 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000332 | Grad Max: 0.011167 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036330 | Grad Max: 0.192008 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000546 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007179 | Grad Max: 0.015017 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000453 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001842 | Grad Max: 0.007045 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000762 | Grad Max: 0.002906 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023441 | Grad Max: 0.023441 [GRADIENT NORM TOTAL] 11.1788 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50108045 0.49891958] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 676/1372 | C: 727/1321 [LOSS Ex1] A: 0.63079 | B: 0.60391 | C: 0.59329 [LOGITS Ex2 A] Mean 
Abs: 2.108 | Max: 6.596 [LOSS Ex2] A: 0.09495 | B: 0.32000 | C: 0.21359 ** [JOINT LOSS] ** : 0.818838 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005528 | Grad Max: 0.220326 -> Layer: shared_layers.0.bias | Grad Mean: 0.548176 | Grad Max: 2.900694 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002066 | Grad Max: 0.005136 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000025 | Grad Max: 0.000025 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003452 | Grad Max: 0.526271 -> Layer: exit2_layers.0.bias | Grad Mean: 0.063315 | Grad Max: 2.898128 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000326 | Grad Max: 0.011181 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035536 | Grad Max: 0.193536 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000566 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007313 | Grad Max: 0.015642 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000446 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001948 | Grad Max: 0.007061 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000809 | Grad Max: 0.002902 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025550 | Grad Max: 0.025550 [GRADIENT NORM TOTAL] 12.0014 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.755707 0.24429302] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 665/1383 | C: 521/855 [LOSS Ex1] A: 0.62568 | B: 0.60415 | C: 0.59047 [LOGITS Ex2 A] Mean Abs: 2.089 | Max: 6.754 [LOSS Ex2] A: 0.11416 | B: 0.30940 | C: 0.20412 ** [JOINT LOSS] ** : 0.815996 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007558 | Grad Max: 0.201804 -> Layer: shared_layers.0.bias | Grad Mean: 0.590470 | Grad Max: 2.553873 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002091 | Grad Max: 0.005789 -> Layer: exit1_layers.0.bias | Grad 
Mean: 0.009872 | Grad Max: 0.009872 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003695 | Grad Max: 0.424773 -> Layer: exit2_layers.0.bias | Grad Mean: 0.068390 | Grad Max: 2.312513 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000393 | Grad Max: 0.013079 -> Layer: exit2_layers.3.bias | Grad Mean: 0.041873 | Grad Max: 0.227383 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000057 | Grad Max: 0.000633 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008448 | Grad Max: 0.017121 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000025 | Grad Max: 0.000506 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002226 | Grad Max: 0.007350 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000987 | Grad Max: 0.003176 -> Layer: exit2_layers.12.bias | Grad Mean: 0.029757 | Grad Max: 0.029757 [GRADIENT NORM TOTAL] 12.0736 [EPOCH SUMMARY] Train Loss: 0.8171 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8030 | Alpha: 0.5500 No improve count: 14/15 ############################## EPOCH 188/500 START ############################## >>> [TRAIN] BATCH 0 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482851 0.35171497] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 674/1374 | C: 717/1331 [LOSS Ex1] A: 0.62377 | B: 0.59966 | C: 0.59829 [LOGITS Ex2 A] Mean Abs: 2.147 | Max: 8.416 [LOSS Ex2] A: 0.09621 | B: 0.29634 | C: 0.20370 ** [JOINT LOSS] ** : 0.805992 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.007065 | Grad Max: 0.189758 -> Layer: shared_layers.0.bias | Grad Mean: 0.565639 | Grad Max: 2.543622 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002158 | Grad Max: 0.005998 -> Layer: exit1_layers.0.bias | Grad Mean: 0.011180 | Grad Max: 0.011180 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003577 | Grad Max: 0.471538 -> Layer: 
exit2_layers.0.bias | Grad Mean: 0.066321 | Grad Max: 2.610998 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000378 | Grad Max: 0.011868 -> Layer: exit2_layers.3.bias | Grad Mean: 0.040288 | Grad Max: 0.203090 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000054 | Grad Max: 0.000603 -> Layer: exit2_layers.6.bias | Grad Mean: 0.008188 | Grad Max: 0.016632 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000024 | Grad Max: 0.000454 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002144 | Grad Max: 0.007260 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000944 | Grad Max: 0.003294 -> Layer: exit2_layers.12.bias | Grad Mean: 0.028444 | Grad Max: 0.028444 [GRADIENT NORM TOTAL] 11.9788 >>> [TRAIN] BATCH 1 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5085088 0.49149123] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 626/1230 | C: 733/1315 [LOSS Ex1] A: 0.62457 | B: 0.60360 | C: 0.60227 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 8.874 [LOSS Ex2] A: 0.08544 | B: 0.30428 | C: 0.22550 ** [JOINT LOSS] ** : 0.815224 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006319 | Grad Max: 0.181624 -> Layer: shared_layers.0.bias | Grad Mean: 0.544914 | Grad Max: 2.342465 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002095 | Grad Max: 0.005563 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002376 | Grad Max: 0.002376 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003374 | Grad Max: 0.382795 -> Layer: exit2_layers.0.bias | Grad Mean: 0.062499 | Grad Max: 2.120625 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.010717 -> Layer: exit2_layers.3.bias | Grad Mean: 0.037690 | Grad Max: 0.194696 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000051 | Grad Max: 0.000625 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007637 | Grad Max: 0.016299 -> Layer: 
exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000437 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002002 | Grad Max: 0.006239 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000871 | Grad Max: 0.002774 -> Layer: exit2_layers.12.bias | Grad Mean: 0.026312 | Grad Max: 0.026312 [GRADIENT NORM TOTAL] 11.2040 >>> [TRAIN] BATCH 2 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5037876 0.49621245] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 676/1372 | C: 717/1331 [LOSS Ex1] A: 0.62125 | B: 0.60390 | C: 0.59999 [LOGITS Ex2 A] Mean Abs: 2.127 | Max: 7.528 [LOSS Ex2] A: 0.10465 | B: 0.32090 | C: 0.18697 ** [JOINT LOSS] ** : 0.812557 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004944 | Grad Max: 0.206374 -> Layer: shared_layers.0.bias | Grad Mean: 0.564062 | Grad Max: 2.790831 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002202 | Grad Max: 0.006463 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007453 | Grad Max: 0.007453 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003466 | Grad Max: 0.508147 -> Layer: exit2_layers.0.bias | Grad Mean: 0.064277 | Grad Max: 2.808617 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000333 | Grad Max: 0.010766 -> Layer: exit2_layers.3.bias | Grad Mean: 0.036592 | Grad Max: 0.185497 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000575 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007327 | Grad Max: 0.016097 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000441 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001880 | Grad Max: 0.006676 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000770 | Grad Max: 0.002800 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023972 | Grad Max: 0.023972 [GRADIENT NORM TOTAL] 12.2744 >>> [TRAIN] BATCH 3 START <<< [DATA A] Shape: torch.Size([2048, 
32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51225144 0.48774853] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 665/1383 | C: 743/1305 [LOSS Ex1] A: 0.62021 | B: 0.60415 | C: 0.60071 [LOGITS Ex2 A] Mean Abs: 2.100 | Max: 6.858 [LOSS Ex2] A: 0.10946 | B: 0.31718 | C: 0.22692 ** [JOINT LOSS] ** : 0.826208 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.003831 | Grad Max: 0.207102 -> Layer: shared_layers.0.bias | Grad Mean: 0.523785 | Grad Max: 2.536822 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002181 | Grad Max: 0.005386 -> Layer: exit1_layers.0.bias | Grad Mean: 0.004883 | Grad Max: 0.004883 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003145 | Grad Max: 0.386604 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058194 | Grad Max: 2.134387 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000325 | Grad Max: 0.011181 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035342 | Grad Max: 0.186598 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000484 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006967 | Grad Max: 0.013709 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000407 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001868 | Grad Max: 0.006336 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000835 | Grad Max: 0.002625 -> Layer: exit2_layers.12.bias | Grad Mean: 0.025905 | Grad Max: 0.025905 [GRADIENT NORM TOTAL] 10.9855 >>> [TRAIN] BATCH 4 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004027 0.49959725] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 674/1374 | C: 732/1316 [LOSS Ex1] A: 0.62925 | B: 0.59966 | C: 0.59559 [LOGITS Ex2 A] Mean Abs: 2.062 | Max: 
5.513 [LOSS Ex2] A: 0.10482 | B: 0.29870 | C: 0.20505 ** [JOINT LOSS] ** : 0.811026 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005890 | Grad Max: 0.164258 -> Layer: shared_layers.0.bias | Grad Mean: 0.503021 | Grad Max: 2.246016 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002080 | Grad Max: 0.005647 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007750 | Grad Max: 0.007750 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003266 | Grad Max: 0.393191 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060258 | Grad Max: 2.186040 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000335 | Grad Max: 0.010196 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035632 | Grad Max: 0.176351 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000048 | Grad Max: 0.000620 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007243 | Grad Max: 0.015093 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000432 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001874 | Grad Max: 0.006595 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000820 | Grad Max: 0.003027 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024537 | Grad Max: 0.024537 [GRADIENT NORM TOTAL] 10.7660 >>> [TRAIN] BATCH 5 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.169 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.143 | Max: 0.961 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.54094696 0.45905304] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.563 | Std: 0.085 [MASKS] A(Pass/Fail): 689/1359 | B: 626/1230 | C: 727/1321 [LOSS Ex1] A: 0.63013 | B: 0.60360 | C: 0.60427 [LOGITS Ex2 A] Mean Abs: 2.074 | Max: 6.300 [LOSS Ex2] A: 0.10544 | B: 0.30236 | C: 0.23467 ** [JOINT LOSS] ** : 0.826829 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.005563 | Grad Max: 0.181007 -> Layer: shared_layers.0.bias | Grad Mean: 0.505638 | Grad Max: 2.351692 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002025 | Grad Max: 0.005500 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000525 | 
Grad Max: 0.000525 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003151 | Grad Max: 0.461855 -> Layer: exit2_layers.0.bias | Grad Mean: 0.058379 | Grad Max: 2.534984 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000330 | Grad Max: 0.011852 -> Layer: exit2_layers.3.bias | Grad Mean: 0.035496 | Grad Max: 0.204254 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000047 | Grad Max: 0.000527 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007135 | Grad Max: 0.014628 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000021 | Grad Max: 0.000449 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001902 | Grad Max: 0.006345 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000795 | Grad Max: 0.002543 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024851 | Grad Max: 0.024851 [GRADIENT NORM TOTAL] 10.7856 >>> [TRAIN] BATCH 6 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.175 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.157 | Max: 1.209 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.8331859 0.1668141] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.570 | Std: 0.090 [MASKS] A(Pass/Fail): 745/1303 | B: 676/1372 | C: 699/1349 [LOSS Ex1] A: 0.62344 | B: 0.60390 | C: 0.60141 [LOGITS Ex2 A] Mean Abs: 2.120 | Max: 7.050 [LOSS Ex2] A: 0.08966 | B: 0.32070 | C: 0.21334 ** [JOINT LOSS] ** : 0.817487 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004892 | Grad Max: 0.219203 -> Layer: shared_layers.0.bias | Grad Mean: 0.576086 | Grad Max: 2.925282 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002068 | Grad Max: 0.005427 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007062 | Grad Max: 0.007062 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003501 | Grad Max: 0.485263 -> Layer: exit2_layers.0.bias | Grad Mean: 0.065402 | Grad Max: 2.698235 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000351 | Grad Max: 0.010652 -> Layer: exit2_layers.3.bias | Grad Mean: 0.038217 | Grad Max: 0.191218 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000049 | Grad Max: 
0.000602 -> Layer: exit2_layers.6.bias | Grad Mean: 0.007585 | Grad Max: 0.016663 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000022 | Grad Max: 0.000481 -> Layer: exit2_layers.9.bias | Grad Mean: 0.002004 | Grad Max: 0.006927 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000881 | Grad Max: 0.003071 -> Layer: exit2_layers.12.bias | Grad Mean: 0.027119 | Grad Max: 0.027119 [GRADIENT NORM TOTAL] 12.4205 >>> [TRAIN] BATCH 7 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.170 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5010807 0.4989193] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.089 [MASKS] A(Pass/Fail): 726/1322 | B: 665/1383 | C: 712/1336 [LOSS Ex1] A: 0.63079 | B: 0.60415 | C: 0.60024 [LOGITS Ex2 A] Mean Abs: 2.125 | Max: 6.172 [LOSS Ex2] A: 0.09164 | B: 0.31723 | C: 0.21131 ** [JOINT LOSS] ** : 0.818454 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004494 | Grad Max: 0.175036 -> Layer: shared_layers.0.bias | Grad Mean: 0.473940 | Grad Max: 2.295432 -> Layer: exit1_layers.0.weight | Grad Mean: 0.001995 | Grad Max: 0.004830 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000634 | Grad Max: 0.000634 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002939 | Grad Max: 0.399060 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053932 | Grad Max: 2.203146 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000289 | Grad Max: 0.010006 -> Layer: exit2_layers.3.bias | Grad Mean: 0.031327 | Grad Max: 0.160608 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000041 | Grad Max: 0.000483 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006300 | Grad Max: 0.013070 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000018 | Grad Max: 0.000356 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001653 | Grad Max: 0.005524 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000719 | Grad Max: 0.002710 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022183 | Grad Max: 0.022183 
[GRADIENT NORM TOTAL] 10.0767 >>> [TRAIN] BATCH 8 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.151 | Max: 0.945 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.7557064 0.24429356] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.567 | Std: 0.090 [MASKS] A(Pass/Fail): 713/1335 | B: 674/1374 | C: 713/1335 [LOSS Ex1] A: 0.62568 | B: 0.59966 | C: 0.59937 [LOGITS Ex2 A] Mean Abs: 2.093 | Max: 6.156 [LOSS Ex2] A: 0.11825 | B: 0.29279 | C: 0.20571 ** [JOINT LOSS] ** : 0.813824 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.006023 | Grad Max: 0.177968 -> Layer: shared_layers.0.bias | Grad Mean: 0.500950 | Grad Max: 2.252007 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002101 | Grad Max: 0.005247 -> Layer: exit1_layers.0.bias | Grad Mean: 0.007556 | Grad Max: 0.007556 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003097 | Grad Max: 0.395739 -> Layer: exit2_layers.0.bias | Grad Mean: 0.057109 | Grad Max: 2.157021 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000318 | Grad Max: 0.009114 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033829 | Grad Max: 0.170937 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000570 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006807 | Grad Max: 0.014402 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000381 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001797 | Grad Max: 0.006181 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000790 | Grad Max: 0.002967 -> Layer: exit2_layers.12.bias | Grad Mean: 0.023809 | Grad Max: 0.023809 [GRADIENT NORM TOTAL] 10.4239 >>> [TRAIN] BATCH 9 START <<< [DATA A] Shape: torch.Size([1616, 32]) | Mean: 0.052 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.101 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.6482838 0.3517162] | Indices: [0 1] | Label Real: 0 [CONFIDENCE A] Mean: 0.569 | Std: 0.090 [MASKS] A(Pass/Fail): 588/1028 | B: 626/1230 | C: 
741/1307 [LOSS Ex1] A: 0.62377 | B: 0.60360 | C: 0.59659 [LOGITS Ex2 A] Mean Abs: 2.148 | Max: 10.256 [LOSS Ex2] A: 0.10247 | B: 0.30266 | C: 0.19399 ** [JOINT LOSS] ** : 0.807691 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004910 | Grad Max: 0.157883 -> Layer: shared_layers.0.bias | Grad Mean: 0.466297 | Grad Max: 2.143766 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002146 | Grad Max: 0.005608 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000444 | Grad Max: 0.000444 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002949 | Grad Max: 0.367787 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053999 | Grad Max: 2.035715 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000305 | Grad Max: 0.008992 -> Layer: exit2_layers.3.bias | Grad Mean: 0.033028 | Grad Max: 0.171522 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000044 | Grad Max: 0.000502 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006718 | Grad Max: 0.013898 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000431 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001747 | Grad Max: 0.006530 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000722 | Grad Max: 0.002861 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022394 | Grad Max: 0.022394 [GRADIENT NORM TOTAL] 9.8129 >>> [TRAIN] BATCH 10 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.053 | Std: 0.173 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.156 | Max: 1.344 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50850874 0.49149126] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.569 | Std: 0.091 [MASKS] A(Pass/Fail): 729/1319 | B: 676/1372 | C: 738/1310 [LOSS Ex1] A: 0.62457 | B: 0.60390 | C: 0.59436 [LOGITS Ex2 A] Mean Abs: 2.149 | Max: 10.342 [LOSS Ex2] A: 0.09548 | B: 0.32438 | C: 0.20165 ** [JOINT LOSS] ** : 0.814785 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004997 | Grad Max: 0.190291 -> Layer: shared_layers.0.bias | Grad Mean: 0.522649 | Grad Max: 2.510825 -> Layer: exit1_layers.0.weight | 
Grad Mean: 0.002158 | Grad Max: 0.005315 -> Layer: exit1_layers.0.bias | Grad Mean: 0.000801 | Grad Max: 0.000801 -> Layer: exit2_layers.0.weight | Grad Mean: 0.003266 | Grad Max: 0.480216 -> Layer: exit2_layers.0.bias | Grad Mean: 0.060552 | Grad Max: 2.650548 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000319 | Grad Max: 0.009269 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034541 | Grad Max: 0.173877 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000045 | Grad Max: 0.000517 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006964 | Grad Max: 0.013582 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000405 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001839 | Grad Max: 0.006038 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000779 | Grad Max: 0.002799 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024194 | Grad Max: 0.024194 [GRADIENT NORM TOTAL] 11.2860 >>> [TRAIN] BATCH 11 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.052 | Std: 0.171 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.155 | Max: 1.258 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.50378674 0.4962133 ] | Indices: [0 1] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.091 [MASKS] A(Pass/Fail): 717/1331 | B: 665/1383 | C: 697/1351 [LOSS Ex1] A: 0.62125 | B: 0.60415 | C: 0.60354 [LOGITS Ex2 A] Mean Abs: 2.126 | Max: 7.049 [LOSS Ex2] A: 0.09714 | B: 0.31972 | C: 0.22355 ** [JOINT LOSS] ** : 0.823120 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004298 | Grad Max: 0.165555 -> Layer: shared_layers.0.bias | Grad Mean: 0.472334 | Grad Max: 2.113328 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002132 | Grad Max: 0.005500 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002926 | Grad Max: 0.002926 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002889 | Grad Max: 0.377320 -> Layer: exit2_layers.0.bias | Grad Mean: 0.053513 | Grad Max: 2.095467 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000293 | Grad Max: 0.009286 -> Layer: exit2_layers.3.bias | Grad Mean: 
0.031812 | Grad Max: 0.158892 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000042 | Grad Max: 0.000462 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006425 | Grad Max: 0.012741 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000401 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001717 | Grad Max: 0.006114 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000799 | Grad Max: 0.002788 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024305 | Grad Max: 0.024305 [GRADIENT NORM TOTAL] 9.7884 >>> [TRAIN] BATCH 12 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.051 | Std: 0.172 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.154 | Max: 1.297 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.51225185 0.48774815] | Indices: [1 0] | Label Real: 1 [CONFIDENCE A] Mean: 0.568 | Std: 0.090 [MASKS] A(Pass/Fail): 724/1324 | B: 674/1374 | C: 747/1301 [LOSS Ex1] A: 0.62021 | B: 0.59966 | C: 0.59277 [LOGITS Ex2 A] Mean Abs: 2.090 | Max: 6.000 [LOSS Ex2] A: 0.11596 | B: 0.28734 | C: 0.19064 ** [JOINT LOSS] ** : 0.802194 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004727 | Grad Max: 0.175514 -> Layer: shared_layers.0.bias | Grad Mean: 0.460077 | Grad Max: 2.239958 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002211 | Grad Max: 0.006074 -> Layer: exit1_layers.0.bias | Grad Mean: 0.005551 | Grad Max: 0.005551 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002800 | Grad Max: 0.329440 -> Layer: exit2_layers.0.bias | Grad Mean: 0.051954 | Grad Max: 1.797398 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000303 | Grad Max: 0.011107 -> Layer: exit2_layers.3.bias | Grad Mean: 0.032540 | Grad Max: 0.190444 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000043 | Grad Max: 0.000468 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006503 | Grad Max: 0.013475 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000019 | Grad Max: 0.000404 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001715 | Grad Max: 0.006109 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000726 | 
Grad Max: 0.002940 -> Layer: exit2_layers.12.bias | Grad Mean: 0.022419 | Grad Max: 0.022419 [GRADIENT NORM TOTAL] 9.5748 >>> [TRAIN] BATCH 13 START <<< [DATA A] Shape: torch.Size([2048, 32]) | Mean: 0.050 | Std: 0.167 | Min: 0.000 | Max: 1.000 [LOGITS Ex1 A] Mean Abs: 0.147 | Max: 1.073 [SAMPLE 0 PREDICTION A] Top2 Probs: [0.5004028 0.49959713] | Indices: [1 0] | Label Real: 0 [CONFIDENCE A] Mean: 0.565 | Std: 0.089 [MASKS] A(Pass/Fail): 686/1362 | B: 626/1230 | C: 520/856 [LOSS Ex1] A: 0.62925 | B: 0.60360 | C: 0.59576 [LOGITS Ex2 A] Mean Abs: 2.088 | Max: 5.832 [LOSS Ex2] A: 0.10364 | B: 0.29695 | C: 0.20532 ** [JOINT LOSS] ** : 0.811507 [GRADIENTS CHECK] -> Layer: shared_layers.0.weight | Grad Mean: 0.004850 | Grad Max: 0.162666 -> Layer: shared_layers.0.bias | Grad Mean: 0.458330 | Grad Max: 2.079142 -> Layer: exit1_layers.0.weight | Grad Mean: 0.002077 | Grad Max: 0.005857 -> Layer: exit1_layers.0.bias | Grad Mean: 0.002294 | Grad Max: 0.002294 -> Layer: exit2_layers.0.weight | Grad Mean: 0.002962 | Grad Max: 0.334687 -> Layer: exit2_layers.0.bias | Grad Mean: 0.054633 | Grad Max: 1.851790 -> Layer: exit2_layers.3.weight | Grad Mean: 0.000316 | Grad Max: 0.011321 -> Layer: exit2_layers.3.bias | Grad Mean: 0.034012 | Grad Max: 0.192770 -> Layer: exit2_layers.6.weight | Grad Mean: 0.000046 | Grad Max: 0.000492 -> Layer: exit2_layers.6.bias | Grad Mean: 0.006939 | Grad Max: 0.014465 -> Layer: exit2_layers.9.weight | Grad Mean: 0.000020 | Grad Max: 0.000426 -> Layer: exit2_layers.9.bias | Grad Mean: 0.001832 | Grad Max: 0.006598 -> Layer: exit2_layers.12.weight | Grad Mean: 0.000777 | Grad Max: 0.002914 -> Layer: exit2_layers.12.bias | Grad Mean: 0.024053 | Grad Max: 0.024053 [GRADIENT NORM TOTAL] 9.7263 [EPOCH SUMMARY] Train Loss: 0.8148 [VALIDATION] Starting... [VAL] Processando primeiro batch de validação... [EPOCH END] Val Loss: 0.8012 | Alpha: 0.5500 No improve count: 15/15 EARLY STOPPING TRIGGERED
Modelo treinado e salvo em 'models/teste_ljoint9.pth'
In [46]:
# Reload the best checkpoint saved during training, then evaluate it on each
# held-out test set, reporting accuracy, mean per-sample latency, and the
# early-exit rate of the multi-exit network.
#
# NOTE(review): consider torch.load(..., weights_only=True) for untrusted
# checkpoints — confirm the installed torch version supports it.
model.load_state_dict(torch.load(f'models/{modelname}.pth'))
print(f"Modelo 'models/{modelname}.pth' carregado\n")

# One (label, loader) pair per dataset — replaces three copy-pasted blocks.
# Order must match test_loaders: [UNSW, BOT, CIC].
for i, (base_name, loader) in enumerate(zip(["UNSW", "BOT", "CIC"], test_loaders)):
    # Blank line between bases (first base has none), matching the original output.
    prefix = "\n" if i > 0 else ""
    print(f"{prefix}Base: {base_name}")
    results = evaluate_model(model, loader, limiar, device=device)
    print("-" * 20)
    print(f" Accuracy: {results['accuracy']:.4f}%")
    print(f" Avg. Inference Time: {results['avg_inference_time_ms']:.4f} ms")
    print(f" Early Exit Rate: {results['exit_rate']:.4f}% "
          f"({results['exited_early_count']}/{results['total_samples']})")
    print("-" * 20)
Modelo 'models/teste_ljoint9.pth' carregado Base: UNSW
True Positives (TP): 7440 True Negatives (TN): 8804 False Positives (FP): 196 False Negatives (FN): 1560 F1 Score: 0.8944 True Positive Rate (TPR) / Recall: 0.8267 True Negative Rate (TNR) / Specificity: 0.9782 -------------------- Accuracy: 90.2444% Avg. Inference Time: 0.0025 ms Early Exit Rate: 35.0222% (6304/18000) -------------------- Base: BOT
True Positives (TP): 2907 True Negatives (TN): 3636 False Positives (FP): 364 False Negatives (FN): 1093 F1 Score: 0.7996 True Positive Rate (TPR) / Recall: 0.7268 True Negative Rate (TNR) / Specificity: 0.9090 -------------------- Accuracy: 81.7875% Avg. Inference Time: 0.0025 ms Early Exit Rate: 32.9500% (2636/8000) -------------------- Base: CIC
True Positives (TP): 12647 True Negatives (TN): 13591 False Positives (FP): 409 False Negatives (FN): 1353 F1 Score: 0.9349 True Positive Rate (TPR) / Recall: 0.9034 True Negative Rate (TNR) / Specificity: 0.9708 -------------------- Accuracy: 93.7071% Avg. Inference Time: 0.0023 ms Early Exit Rate: 35.5750% (9961/28000) --------------------
In [ ]:
In [ ]: